From bd86e2f1bf94dfb9b67b301451a5abb079ba8e26 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 10 Mar 2023 10:49:09 +0800
Subject: [PATCH 01/54] [feat]: add sequence encoding module

---
 .../python/feature_column/feature_column.py   |  12 ++
 easy_rec/python/layers/cmbf.py                |   3 +-
 easy_rec/python/layers/input_layer.py         |  21 ++-
 easy_rec/python/layers/sequence_encoder.py    | 159 ++++++++++++++++++
 easy_rec/python/layers/uniter.py              |   3 +-
 easy_rec/python/model/dbmtl.py                |  19 +++
 easy_rec/python/model/easy_rec_model.py       |  21 +++
 easy_rec/python/protos/dbmtl.proto            |   6 +
 easy_rec/python/protos/feature_config.proto   |   2 +
 easy_rec/python/protos/layer.proto            |  36 ++++
 easy_rec/python/test/train_eval_test.py       |   2 +-
 easy_rec/version.py                           |   2 +-
 setup.cfg                                     |   2 +-
 13 files changed, 279 insertions(+), 9 deletions(-)
 create mode 100644 easy_rec/python/layers/sequence_encoder.py

diff --git a/easy_rec/python/feature_column/feature_column.py b/easy_rec/python/feature_column/feature_column.py
index 94a9cd132..04fc07baf 100644
--- a/easy_rec/python/feature_column/feature_column.py
+++ b/easy_rec/python/feature_column/feature_column.py
@@ -86,6 +86,8 @@ def _cmp_embed_config(a, b):
             'shared embed info of [%s] is not matched [%s] vs [%s]' % (
                 embed_name, config, self._share_embed_infos[embed_name])
         self._share_embed_names[embed_name] += 1
+        if config.feature_type == FeatureConfig.FeatureType.SequenceFeature:
+          self._share_embed_infos[embed_name] = copy_obj(config)
       else:
         self._share_embed_names[embed_name] = 1
         self._share_embed_infos[embed_name] = copy_obj(config)
@@ -156,6 +158,11 @@ def _cmp_embed_config(a, b):
             combiner=self._share_embed_infos[embed_name].combiner,
             partitioner=partitioner,
             ev_params=ev_params)
+        config = self._share_embed_infos[embed_name]
+        max_seq_len = config.max_seq_len if config.HasField(
+            'max_seq_len') else -1
+        for fc in share_embed_fcs:
+          fc.max_seq_length = max_seq_len
         self._deep_share_embed_columns[embed_name] = share_embed_fcs
 
       # for handling wide share embedding columns
@@ -168,6 +175,11 @@ def _cmp_embed_config(a, b):
             combiner='sum',
             partitioner=partitioner,
             ev_params=ev_params)
+        config = self._share_embed_infos[embed_name]
+        max_seq_len = config.max_seq_len if config.HasField(
+            'max_seq_len') else -1
+        for fc in share_embed_fcs:
+          fc.max_seq_length = max_seq_len
         self._wide_share_embed_columns[embed_name] = share_embed_fcs
 
     for fc_name in self._deep_columns:
diff --git a/easy_rec/python/layers/cmbf.py b/easy_rec/python/layers/cmbf.py
index 2c6ed8444..b42ddfd30 100644
--- a/easy_rec/python/layers/cmbf.py
+++ b/easy_rec/python/layers/cmbf.py
@@ -33,7 +33,8 @@ def __init__(self, model_config, feature_configs, features, cmbf_config,
       has_feature = True
     self._txt_seq_features = None
     if input_layer.has_group('text'):
-      self._txt_seq_features = input_layer(features, 'text', is_combine=False)
+      self._txt_seq_features, _, _ = input_layer(
+          features, 'text', is_combine=False)
       has_feature = True
     self._other_features = None
     if input_layer.has_group('other'):  # e.g. statistical feature
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index 731f47c82..fa17a1c15 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -123,12 +123,25 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
       group_columns, group_seq_columns = feature_group.select_columns(
           self._fc_parser)
 
-      assert len(group_columns) == 0, \
-          'there are none sequence columns: %s' % str(group_columns)
+      embedding_reg_lst = []
+      output_features = None
+      group_features = []
+      if group_columns:
+        cols_to_output_tensors = OrderedDict()
+        output_features = feature_column.input_layer(
+            features,
+            group_columns,
+            cols_to_output_tensors=cols_to_output_tensors,
+            feature_name_to_output_tensors=feature_name_to_output_tensors)
+        group_features = [cols_to_output_tensors[x] for x in group_columns]
+
+        for col, val in cols_to_output_tensors.items():
+          if isinstance(col, EmbeddingColumn) or isinstance(
+              col, SharedEmbeddingColumn):
+            embedding_reg_lst.append(val)
 
       builder = feature_column._LazyBuilder(features)
       seq_features = []
-      embedding_reg_lst = []
       for fc in group_seq_columns:
         with variable_scope.variable_scope('input_layer/' +
                                            fc.categorical_column.name):
@@ -140,7 +153,7 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
           embedding_reg_lst.append(tmp_embedding)
       regularizers.apply_regularization(
           self._embedding_regularizer, weights_list=embedding_reg_lst)
-      return seq_features
+      return seq_features, output_features, group_features
 
   def single_call_input_layer(self,
                               features,
diff --git a/easy_rec/python/layers/sequence_encoder.py b/easy_rec/python/layers/sequence_encoder.py
new file mode 100644
index 000000000..07c339890
--- /dev/null
+++ b/easy_rec/python/layers/sequence_encoder.py
@@ -0,0 +1,159 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+
+import tensorflow as tf
+
+from easy_rec.python.compat import regularizers
+from easy_rec.python.layers import dnn
+from easy_rec.python.layers import multihead_cross_attention
+from easy_rec.python.utils.shape_utils import get_shape_list
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class SequenceEncoder(object):
+
+  def __init__(self, input_layer, feature_groups_config, emb_reg, l2_reg):
+    self._input_layer = input_layer
+    self._feature_groups_config = {
+        x.group_name: x for x in feature_groups_config
+    }
+    self._emb_reg = emb_reg
+    self._l2_reg = l2_reg
+
+  def __call__(self, features, group_name, is_training=True, *args, **kwargs):
+    group_config = self._feature_groups_config[group_name]
+    if len(group_config.sequence_encoders) == 0:
+      return None
+
+    seq_features, target_feature, target_features = self._input_layer(
+        features, group_name, is_combine=False)
+    assert len(
+        seq_features) > 0, 'sequence feature is empty in group: ' + group_name
+
+    outputs = []
+    for encoder in group_config.sequence_encoders:
+      encoder_type = encoder.WhichOneof('encoder').lower()
+      if encoder_type == 'bst':
+        encoding = self.bst_encoder(seq_features, target_feature, group_name,
+                                    encoder.bst)
+        outputs.append(encoding)
+      elif encoder_type == 'din':
+        encoding = self.din_encoder(seq_features, target_feature, group_name,
+                                    encoder.din, is_training)
+        outputs.append(encoding)
+      else:
+        assert False, 'unsupported sequence encode type: ' + encoder_type
+
+    if len(outputs) == 0:
+      logging.warning(
+          "there's no sequence encoder configured in feature group: " +
+          group_name)
+      return None
+    if len(outputs) == 1:
+      return outputs[0]
+
+    return tf.concat(outputs, axis=-1)
+
+  def din_encoder(self, seq_features, target_feature, group_name, config,
+                  is_training):
+    seq_input = [seq_fea for seq_fea, _ in seq_features]
+    regularizers.apply_regularization(self._emb_reg, weights_list=seq_input)
+    keys = tf.concat(seq_input, axis=-1)
+
+    target_emb_size = target_feature.shape.as_list()[-1]
+    seq_emb_size = keys.shape.as_list()[-1]
+    assert target_emb_size == seq_emb_size, 'the embedding size of sequence and target item is not equal' \
+                                            ' in feature group:' + group_name
+
+    batch_size, max_seq_len, _ = get_shape_list(keys, 3)
+    queries = tf.tile(tf.expand_dims(target_feature, 1), [1, max_seq_len, 1])
+    din_all = tf.concat([queries, keys, queries - keys, queries * keys],
+                        axis=-1)
+    din_layer = dnn.DNN(
+        config.dnn,
+        self._l2_reg,
+        group_name + '/din_attention',
+        is_training,
+        last_layer_no_activation=True,
+        last_layer_no_batch_norm=True)
+    output = din_layer(din_all)  # [B, L, 1]
+    scores = tf.transpose(output, [0, 2, 1])  # [B, 1, L]
+
+    seq_len = seq_features[0][1]
+    seq_mask = tf.sequence_mask(seq_len, max_seq_len, dtype=tf.bool)
+    seq_mask = tf.expand_dims(seq_mask, 1)
+    paddings = tf.ones_like(scores) * (-2**32 + 1)
+    scores = tf.where(seq_mask, scores, paddings)  # [B, 1, L]
+    scores = scores / (seq_emb_size**0.5)
+    # normalization with softmax is abandoned according to the original paper
+    scores = tf.nn.sigmoid(scores)
+    output = tf.squeeze(tf.matmul(scores, keys))
+    return output
+
+  def bst_encoder(self, seq_features, target_feature, group_name, config):
+    seq_embeds = [seq_fea for seq_fea, _ in seq_features]
+    regularizers.apply_regularization(self._emb_reg, weights_list=seq_embeds)
+
+    max_position = config.max_position_embeddings
+    batch_size, max_seq_len, _ = get_shape_list(seq_features[0][0], 3)
+    valid_len = tf.assert_less_equal(
+        max_seq_len,
+        max_position,
+        message='sequence length is greater than `max_position_embeddings`:' +
+        str(max_position) + ' in feature group:' + group_name)
+    with tf.control_dependencies([valid_len]):
+      # seq_input: [batch_size, seq_len, embed_size]
+      seq_input = tf.concat(seq_embeds, axis=-1)
+
+    # seq_len: [batch_size, ], 假设每个sequence feature的length都是相同的
+    seq_len = seq_features[0][1]
+    seq_embed_size = seq_input.shape.as_list()[-1]
+    if target_feature is not None:
+      target_size = target_feature.shape.as_list()[-1]
+      assert seq_embed_size == target_size, 'the embedding size of sequence and target item is not equal' \
+                                            ' in feature group:' + group_name
+      # target_feature: [batch_size, 1, embed_size]
+      target_feature = tf.expand_dims(target_feature, 1)
+      # seq_input: [batch_size, seq_len+1, embed_size]
+      seq_input = tf.concat([target_feature, seq_input], axis=1)
+      max_seq_len += 1
+      seq_len += 1
+
+    if seq_embed_size != config.hidden_size:
+      seq_input = tf.layers.dense(seq_input, config.hidden_size)
+
+    seq_fea = multihead_cross_attention.embedding_postprocessor(
+        seq_input,
+        position_embedding_name=group_name + '_position_embeddings',
+        max_position_embeddings=max_position)
+    seq_mask = tf.map_fn(
+        fn=lambda t: dynamic_mask(t, max_seq_len), elems=tf.to_int32(seq_len))
+    attention_mask = multihead_cross_attention.create_attention_mask_from_input_mask(
+        from_tensor=seq_fea, to_mask=seq_mask)
+
+    hidden_act = multihead_cross_attention.get_activation(config.hidden_act)
+    attention_fea = multihead_cross_attention.transformer_encoder(
+        seq_fea,
+        hidden_size=config.hidden_size,
+        num_hidden_layers=config.num_hidden_layers,
+        num_attention_heads=config.num_attention_heads,
+        attention_mask=attention_mask,
+        intermediate_size=config.intermediate_size,
+        intermediate_act_fn=hidden_act,
+        hidden_dropout_prob=config.hidden_dropout_prob,
+        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
+        initializer_range=config.initializer_range,
+        name=group_name +
+        '/bst')  # shape: [batch_size, seq_length, hidden_size]
+
+    out_fea = attention_fea[:, 0, :]  # target feature
+    return out_fea
+
+
+def dynamic_mask(x, max_len):
+  ones = tf.ones(shape=tf.stack([x]), dtype=tf.int32)
+  zeros = tf.zeros(shape=tf.stack([max_len - x]), dtype=tf.int32)
+  return tf.concat([ones, zeros], axis=0)
diff --git a/easy_rec/python/layers/uniter.py b/easy_rec/python/layers/uniter.py
index 96b9cdc46..248afc1a9 100644
--- a/easy_rec/python/layers/uniter.py
+++ b/easy_rec/python/layers/uniter.py
@@ -31,7 +31,8 @@ def __init__(self, model_config, feature_configs, features, uniter_config,
       tower_num += 1
     self._txt_seq_features = None
     if input_layer.has_group('text'):
-      self._txt_seq_features = input_layer(features, 'text', is_combine=False)
+      self._txt_seq_features, _, _ = input_layer(
+          features, 'text', is_combine=False)
       tower_num += 1
     self._use_token_type = True if tower_num > 1 else False
     self._other_features = None
diff --git a/easy_rec/python/model/dbmtl.py b/easy_rec/python/model/dbmtl.py
index 913793474..3639bf029 100644
--- a/easy_rec/python/model/dbmtl.py
+++ b/easy_rec/python/model/dbmtl.py
@@ -42,6 +42,20 @@ def __init__(self,
     self._init_towers(self._model_config.task_towers)
 
   def build_predict_graph(self):
+    if self._model_config.use_input_batch_norm:
+      self._features = tf.layers.batch_normalization(
+          self._features,
+          training=self._is_training,
+          trainable=True,
+          name='input_bn')
+    if self._model_config.HasField('input_dropout_rate'):
+      drop_rate = self._model_config.input_dropout_rate
+      self._features = tf.layers.dropout(
+          self._features,
+          rate=drop_rate,
+          training=self._is_training,
+          name='input_dropout')
+
     if self._model_config.HasField('bottom_cmbf'):
       bottom_fea = self._cmbf_layer(self._is_training, l2_reg=self._l2_reg)
     elif self._model_config.HasField('bottom_uniter'):
@@ -56,6 +70,11 @@ def build_predict_graph(self):
     else:
       bottom_fea = self._features
 
+    if self._model_config.use_sequence_encoder:
+      seq_encoding = self.get_sequence_encoding(is_training=self._is_training)
+      if seq_encoding is not None:
+        bottom_fea = tf.concat([bottom_fea, seq_encoding], axis=-1)
+
     # MMOE block
     if self._model_config.HasField('expert_dnn'):
       mmoe_layer = mmoe.MMOE(
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index 912291987..eff1af32a 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -12,6 +12,7 @@
 
 from easy_rec.python.compat import regularizers
 from easy_rec.python.layers import input_layer
+from easy_rec.python.layers.sequence_encoder import SequenceEncoder
 from easy_rec.python.utils import constant
 from easy_rec.python.utils import estimator_utils
 from easy_rec.python.utils import restore_filter
@@ -60,6 +61,10 @@ def __init__(self,
     if constant.SAMPLE_WEIGHT in features:
       self._sample_weight = features[constant.SAMPLE_WEIGHT]
 
+    self._sequence_encoder = SequenceEncoder(self._input_layer,
+                                             model_config.feature_groups,
+                                             self._emb_reg, self._l2_reg)
+
   @property
   def embedding_regularization(self):
     return self._base_model_config.embedding_regularization
@@ -99,6 +104,22 @@ def build_input_layer(self, model_config, feature_configs):
         if model_config.HasField('variational_dropout') else None,
         is_training=self._is_training)
 
+  def get_sequence_encoding(self, group_name=None, is_training=True):
+    if group_name is None:
+      seq_encoding = []
+      for group in self.feature_groups:
+        if len(group.sequence_encoders) == 0:
+          continue
+        encoding = self.get_sequence_encoding(group.group_name,
+                                              self._is_training)
+        if encoding is not None:
+          seq_encoding.append(encoding)
+      if seq_encoding:
+        return tf.concat(seq_encoding, axis=-1)
+      else:
+        return None
+    return self._sequence_encoder(self._feature_dict, group_name, is_training)
+
   @abstractmethod
   def build_predict_graph(self):
     pass
diff --git a/easy_rec/python/protos/dbmtl.proto b/easy_rec/python/protos/dbmtl.proto
index 841b8adec..2b1f981aa 100644
--- a/easy_rec/python/protos/dbmtl.proto
+++ b/easy_rec/python/protos/dbmtl.proto
@@ -20,4 +20,10 @@ message DBMTL {
     repeated BayesTaskTower task_towers = 4;
     // l2 regularization
     optional float l2_regularization = 5 [default = 1e-4];
+    // Whether to user sequence encoder
+    required bool use_sequence_encoder = 6 [default = false];
+    // Whether to user sequence encoder
+    required bool use_input_batch_norm = 7 [default = false];
+    // input layer dropout rate
+    optional float input_dropout_rate = 8 [default = 0];
 }
diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto
index 596e87e4d..5ed305c10 100644
--- a/easy_rec/python/protos/feature_config.proto
+++ b/easy_rec/python/protos/feature_config.proto
@@ -3,6 +3,7 @@ package protos;
 
 import "easy_rec/python/protos/hyperparams.proto";
 import "easy_rec/python/protos/dnn.proto";
+import "easy_rec/python/protos/layer.proto";
 enum WideOrDeep {
     DEEP = 0;
     WIDE = 1;
@@ -140,6 +141,7 @@ message FeatureGroupConfig {
     optional WideOrDeep wide_deep = 3 [default = DEEP];
     repeated SeqAttGroupConfig sequence_features = 4;
     optional bool negative_sampler = 5 [default = false];
+    repeated SequenceEncoder sequence_encoders = 6;
 }
 
 message SeqAttMap {
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index 6cea6d3bd..482c5241f 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -74,3 +74,39 @@ message UniterTower {
     // dnn layers for other features
     optional DNN other_feature_dnn = 11;
 }
+
+message SequenceEncoder {
+    // encoder parameters
+    oneof encoder {
+        BSTEncoder bst = 1;
+        DINEncoder din = 2;
+    }
+}
+
+message BSTEncoder {
+    // Size of the encoder layers and the pooler layer
+    required uint32 hidden_size = 1;
+    // Number of hidden layers in the Transformer encoder
+    required uint32 num_hidden_layers = 2;
+    // Number of attention heads for each attention layer in the Transformer encoder
+    required uint32 num_attention_heads = 3;
+    // The size of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
+    required uint32 intermediate_size = 4;
+    // The non-linear activation function (function or string) in the encoder and pooler.
+    required string hidden_act = 5 [default = 'gelu'];  // "gelu", "relu", "tanh" and "swish" are supported.
+    // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler
+    required float hidden_dropout_prob = 6 [default = 0.1];
+    // The dropout ratio for the attention probabilities
+    required float attention_probs_dropout_prob = 7 [default = 0.1];
+    // The maximum sequence length that this model might ever be used with
+    required uint32 max_position_embeddings = 8 [default = 512];
+    // Whether to add position embeddings for the position of each token in the text sequence
+    required bool use_position_embeddings = 9 [default = true];
+    // The stddev of the truncated_normal_initializer for initializing all weight matrices
+    required float initializer_range = 10 [default = 0.02];
+}
+
+message DINEncoder {
+    // din attention layer
+    required DNN dnn = 1;
+}
diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py
index 5fca892b2..57c1d79bd 100644
--- a/easy_rec/python/test/train_eval_test.py
+++ b/easy_rec/python/test/train_eval_test.py
@@ -7,11 +7,11 @@
 import threading
 import time
 import unittest
+from distutils.version import LooseVersion
 
 import numpy as np
 import six
 import tensorflow as tf
-from distutils.version import LooseVersion
 from tensorflow.python.platform import gfile
 
 from easy_rec.python.main import predict
diff --git a/easy_rec/version.py b/easy_rec/version.py
index 6e00ca21f..e4d390b71 100644
--- a/easy_rec/version.py
+++ b/easy_rec/version.py
@@ -1,3 +1,3 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
-__version__ = '0.6.1'
+__version__ = '0.6.2'
diff --git a/setup.cfg b/setup.cfg
index b5b966faa..b180b9fb1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ multi_line_output = 7
 force_single_line = true
 known_standard_library = setuptools
 known_first_party = easy_rec
-known_third_party = absl,common_io,distutils,docutils,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
+known_third_party = absl,common_io,docutils,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
 no_lines_before = LOCALFOLDER
 default_section = THIRDPARTY
 skip = easy_rec/python/protos

From 2b8f2e70cbcf3d2322691be06a751c3f7a8eb93e Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 10 Mar 2023 13:53:20 +0800
Subject: [PATCH 02/54] [feat]: add sequence encoding module

---
 easy_rec/python/layers/cmbf.py                |   4 +
 easy_rec/python/layers/common_layers.py       |  17 ---
 easy_rec/python/layers/dnn.py                 |   4 +-
 .../layers/multihead_cross_attention.py       |  43 +-----
 easy_rec/python/layers/sequence_encoder.py    |  20 ++-
 easy_rec/python/layers/uniter.py              |   8 +-
 .../model/collaborative_metric_learning.py    |   2 +-
 easy_rec/python/utils/activation.py           | 127 ++++++++++++++++++
 8 files changed, 155 insertions(+), 70 deletions(-)
 create mode 100644 easy_rec/python/utils/activation.py

diff --git a/easy_rec/python/layers/cmbf.py b/easy_rec/python/layers/cmbf.py
index b42ddfd30..e5f1caeb2 100644
--- a/easy_rec/python/layers/cmbf.py
+++ b/easy_rec/python/layers/cmbf.py
@@ -326,6 +326,10 @@ def merge_text_embedding(self, txt_embeddings, input_masks):
     return txt_embeddings
 
   def __call__(self, is_training, *args, **kwargs):
+    if not is_training:
+      self._model_config.hidden_dropout_prob = 0.0
+      self._model_config.attention_probs_dropout_prob = 0.0
+
     # shape: [batch_size, image_num/image_dim, hidden_size]
     img_attention_fea = self.image_self_attention_tower()
 
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index 80ad1496f..165fce5e1 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -1,29 +1,12 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import numpy as np
 import tensorflow as tf
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
 
 
-def gelu(x):
-  """Gaussian Error Linear Unit.
-
-  This is a smoother version of the RELU.
-  Original paper: https://arxiv.org/abs/1606.08415
-  Args:
-    x: float Tensor to perform activation.
-
-  Returns:
-    `x` with the GELU activation applied.
-  """
-  cdf = 0.5 * (1.0 + tf.tanh(
-      (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
-  return x * cdf
-
-
 def highway(x,
             size=None,
             activation=None,
diff --git a/easy_rec/python/layers/dnn.py b/easy_rec/python/layers/dnn.py
index 4fdce37ba..d2af5a4cf 100644
--- a/easy_rec/python/layers/dnn.py
+++ b/easy_rec/python/layers/dnn.py
@@ -4,7 +4,7 @@
 
 import tensorflow as tf
 
-from easy_rec.python.utils.load_class import load_by_path
+from easy_rec.python.utils.activation import get_activation
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -34,7 +34,7 @@ def __init__(self,
     self._name = name
     self._is_training = is_training
     logging.info('dnn activation function = %s' % self._config.activation)
-    self.activation = load_by_path(self._config.activation)
+    self.activation = get_activation(self._config.activation, is_training=is_training)
     self._last_layer_no_activation = last_layer_no_activation
     self._last_layer_no_batch_norm = last_layer_no_batch_norm
 
diff --git a/easy_rec/python/layers/multihead_cross_attention.py b/easy_rec/python/layers/multihead_cross_attention.py
index 911ff7bae..bafb7e019 100644
--- a/easy_rec/python/layers/multihead_cross_attention.py
+++ b/easy_rec/python/layers/multihead_cross_attention.py
@@ -5,13 +5,10 @@
 from __future__ import print_function
 
 import math
-
-import six
 import tensorflow as tf
-
 from easy_rec.python.compat.layers import layer_norm as tf_layer_norm
-from easy_rec.python.layers.common_layers import gelu
 from easy_rec.python.utils.shape_utils import get_shape_list
+from easy_rec.python.utils.activation import gelu
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -736,41 +733,3 @@ def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
   output_tensor = layer_norm(input_tensor, name)
   output_tensor = dropout(output_tensor, dropout_prob)
   return output_tensor
-
-
-def get_activation(activation_string):
-  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
-
-  Args:
-    activation_string: String name of the activation function.
-
-  Returns:
-    A Python function corresponding to the activation function. If
-    `activation_string` is None, empty, or "linear", this will return None.
-    If `activation_string` is not a string, it will return `activation_string`.
-
-  Raises:
-    ValueError: The `activation_string` does not correspond to a known
-      activation.
-  """
-  # We assume that anything that's not a string is already an activation
-  # function, so we just return it.
-  if not isinstance(activation_string, six.string_types):
-    return activation_string
-
-  if not activation_string:
-    return None
-
-  act = activation_string.lower()
-  if act == 'linear':
-    return None
-  elif act == 'relu':
-    return tf.nn.relu
-  elif act == 'gelu':
-    return gelu
-  elif act == 'tanh':
-    return tf.tanh
-  elif act == 'swish':
-    return tf.nn.swish
-  else:
-    raise ValueError('Unsupported activation: %s' % act)
diff --git a/easy_rec/python/layers/sequence_encoder.py b/easy_rec/python/layers/sequence_encoder.py
index 07c339890..0d141c094 100644
--- a/easy_rec/python/layers/sequence_encoder.py
+++ b/easy_rec/python/layers/sequence_encoder.py
@@ -8,6 +8,7 @@
 from easy_rec.python.layers import dnn
 from easy_rec.python.layers import multihead_cross_attention
 from easy_rec.python.utils.shape_utils import get_shape_list
+from easy_rec.python.utils.activation import get_activation
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -93,7 +94,11 @@ def din_encoder(self, seq_features, target_feature, group_name, config,
     output = tf.squeeze(tf.matmul(scores, keys))
     return output
 
-  def bst_encoder(self, seq_features, target_feature, group_name, config):
+  def bst_encoder(self, seq_features, target_feature, group_name, config, is_training):
+    if not is_training:
+      config.hidden_dropout_prob = 0.0
+      config.attention_probs_dropout_prob = 0.0
+
     seq_embeds = [seq_fea for seq_fea, _ in seq_features]
     regularizers.apply_regularization(self._emb_reg, weights_list=seq_embeds)
 
@@ -123,7 +128,11 @@ def bst_encoder(self, seq_features, target_feature, group_name, config):
       seq_len += 1
 
     if seq_embed_size != config.hidden_size:
-      seq_input = tf.layers.dense(seq_input, config.hidden_size)
+      seq_input = tf.layers.dense(
+          seq_input,
+          config.hidden_size,
+          activation=tf.nn.relu,
+          kernel_regularizer=self._l2_reg)
 
     seq_fea = multihead_cross_attention.embedding_postprocessor(
         seq_input,
@@ -134,7 +143,7 @@ def bst_encoder(self, seq_features, target_feature, group_name, config):
     attention_mask = multihead_cross_attention.create_attention_mask_from_input_mask(
         from_tensor=seq_fea, to_mask=seq_mask)
 
-    hidden_act = multihead_cross_attention.get_activation(config.hidden_act)
+    hidden_act = get_activation(config.hidden_act)
     attention_fea = multihead_cross_attention.transformer_encoder(
         seq_fea,
         hidden_size=config.hidden_size,
@@ -146,9 +155,8 @@ def bst_encoder(self, seq_features, target_feature, group_name, config):
         hidden_dropout_prob=config.hidden_dropout_prob,
         attention_probs_dropout_prob=config.attention_probs_dropout_prob,
         initializer_range=config.initializer_range,
-        name=group_name +
-        '/bst')  # shape: [batch_size, seq_length, hidden_size]
-
+        name=group_name + '/bst')
+    # attention_fea shape: [batch_size, seq_length, hidden_size]
     out_fea = attention_fea[:, 0, :]  # target feature
     return out_fea
 
diff --git a/easy_rec/python/layers/uniter.py b/easy_rec/python/layers/uniter.py
index 248afc1a9..47ccc678c 100644
--- a/easy_rec/python/layers/uniter.py
+++ b/easy_rec/python/layers/uniter.py
@@ -5,6 +5,7 @@
 from easy_rec.python.layers import dnn
 from easy_rec.python.layers import multihead_cross_attention
 from easy_rec.python.utils.shape_utils import get_shape_list
+from easy_rec.python.utils.activation import get_activation
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -224,6 +225,10 @@ def image_embeddings(self):
     return img_fea
 
   def __call__(self, is_training, *args, **kwargs):
+    if not is_training:
+      self._model_config.hidden_dropout_prob = 0.0
+      self._model_config.attention_probs_dropout_prob = 0.0
+
     sub_modules = []
 
     img_fea = self.image_embeddings()
@@ -262,8 +267,7 @@ def __call__(self, is_training, *args, **kwargs):
       input_mask = tf.concat(masks, axis=1)
       attention_mask = multihead_cross_attention.create_attention_mask_from_input_mask(
           from_tensor=all_fea, to_mask=input_mask)
-      hidden_act = multihead_cross_attention.get_activation(
-          self._model_config.hidden_act)
+      hidden_act = get_activation(self._model_config.hidden_act)
       attention_fea = multihead_cross_attention.transformer_encoder(
           all_fea,
           hidden_size=hidden_size,
diff --git a/easy_rec/python/model/collaborative_metric_learning.py b/easy_rec/python/model/collaborative_metric_learning.py
index 84c87ccaa..7e5d7c008 100644
--- a/easy_rec/python/model/collaborative_metric_learning.py
+++ b/easy_rec/python/model/collaborative_metric_learning.py
@@ -3,7 +3,7 @@
 from easy_rec.python.core.metrics import metric_learning_average_precision_at_k
 from easy_rec.python.core.metrics import metric_learning_recall_at_k
 from easy_rec.python.layers import dnn
-from easy_rec.python.layers.common_layers import gelu
+from easy_rec.python.utils.activation import gelu
 from easy_rec.python.layers.common_layers import highway
 from easy_rec.python.loss.circle_loss import circle_loss
 from easy_rec.python.loss.multi_similarity import ms_loss
diff --git a/easy_rec/python/utils/activation.py b/easy_rec/python/utils/activation.py
new file mode 100644
index 000000000..7b5d5248b
--- /dev/null
+++ b/easy_rec/python/utils/activation.py
@@ -0,0 +1,127 @@
+# -*- encoding: utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import tensorflow as tf
+import numpy as np
+import six
+from easy_rec.python.utils.load_class import load_by_path
+from tensorflow.python.keras.layers import Layer
+try:
+    from tensorflow.python.keras.layers import BatchNormalization
+except ImportError:
+    BatchNormalization = tf.keras.layers.BatchNormalization
+try:
+    from tensorflow.python.ops.init_ops import Zeros
+except ImportError:
+    from tensorflow.python.ops.init_ops_v2 import Zeros
+
+
+class Dice(Layer):
+  """The Data Adaptive Activation Function in DIN,which can be viewed as a generalization of PReLu and can adaptively adjust the rectified point according to distribution of input data.
+    Input shape
+      - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
+    Output shape
+      - Same shape as the input.
+    Arguments
+      - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis).
+      - **epsilon** : Small float added to variance to avoid dividing by zero.
+    References
+      - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf)
+  """
+
+  def __init__(self, axis=-1, epsilon=1e-9, is_training=None, **kwargs):
+    self.axis = axis
+    self.epsilon = epsilon
+    self.is_training = is_training
+    super(Dice, self).__init__(**kwargs)
+
+  def build(self, input_shape):
+    self.bn = BatchNormalization(
+      axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
+    self.alphas = self.add_weight(shape=(input_shape[-1],), initializer=Zeros(
+    ), dtype=tf.float32, name='dice_alpha')  # name='alpha_'+self.name
+    super(Dice, self).build(input_shape)  # Be sure to call this somewhere!
+    self.uses_learning_phase = True
+
+  def call(self, inputs, **kwargs):
+    inputs_normed = self.bn(inputs, training=self.is_training)
+    # tf.layers.batch_normalization(
+    # inputs, axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
+    x_p = tf.sigmoid(inputs_normed)
+    return self.alphas * (1.0 - x_p) * inputs + x_p * inputs
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
+  def get_config(self, ):
+    config = {'axis': self.axis, 'epsilon': self.epsilon}
+    base_config = super(Dice, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+def gelu(x):
+  """Gaussian Error Linear Unit.
+
+  This is a smoother version of the RELU.
+  Original paper: https://arxiv.org/abs/1606.08415
+  Args:
+    x: float Tensor to perform activation.
+
+  Returns:
+    `x` with the GELU activation applied.
+  """
+  cdf = 0.5 * (1.0 + tf.tanh(
+      (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+  return x * cdf
+
+
+def get_activation(activation_string, **kwargs):
+  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
+
+  Args:
+    activation_string: String name of the activation function.
+
+  Returns:
+    A Python function corresponding to the activation function. If
+    `activation_string` is None, empty, or "linear", this will return None.
+    If `activation_string` is not a string, it will return `activation_string`.
+
+  Raises:
+    ValueError: The `activation_string` does not correspond to a known
+      activation.
+  """
+  # We assume that anything that's not a string is already an activation
+  # function, so we just return it.
+  if not isinstance(activation_string, six.string_types):
+    return activation_string
+
+  if not activation_string:
+    return None
+
+  act = activation_string.lower()
+  if act == 'linear':
+    return None
+  elif act == 'relu':
+    return tf.nn.relu
+  elif act == 'gelu':
+    return gelu
+  elif act == 'leaky_relu':
+    return tf.nn.leaky_relu
+  elif act in ('prelu', 'PRelu'):
+    return tf.keras.layers.PReLU(**kwargs)
+  elif act in ("dice", "Dice"):
+    return Dice(**kwargs)
+  elif act == 'elu':
+    return tf.nn.elu
+  elif act == 'selu':
+    return tf.nn.selu
+  elif act == 'tanh':
+    return tf.tanh
+  elif act == 'swish':
+    if tf.__version__ < '1.13.0':
+      return lambda x: x * tf.sigmoid(x)
+    return tf.nn.swish
+  elif act == 'sigmoid':
+    return tf.nn.sigmoid
+  else:
+    return load_by_path(activation_string)

From e666f41026539b723806d0918c989cc77d7c1acb Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 10 Mar 2023 16:55:35 +0800
Subject: [PATCH 03/54] [feat]: add sequence encoding module

---
 easy_rec/python/layers/dnn.py                 | 13 +++-
 .../layers/multihead_cross_attention.py       |  4 +-
 easy_rec/python/layers/sequence_encoder.py    | 13 ++--
 easy_rec/python/layers/uniter.py              |  2 +-
 .../model/collaborative_metric_learning.py    |  2 +-
 easy_rec/python/model/easy_rec_model.py       | 31 ++++----
 easy_rec/python/protos/layer.proto            |  2 +-
 easy_rec/python/utils/activation.py           | 71 +++++++++++--------
 8 files changed, 81 insertions(+), 57 deletions(-)

diff --git a/easy_rec/python/layers/dnn.py b/easy_rec/python/layers/dnn.py
index d2af5a4cf..3365f47f0 100644
--- a/easy_rec/python/layers/dnn.py
+++ b/easy_rec/python/layers/dnn.py
@@ -34,7 +34,13 @@ def __init__(self,
     self._name = name
     self._is_training = is_training
     logging.info('dnn activation function = %s' % self._config.activation)
-    self.activation = get_activation(self._config.activation, is_training=is_training)
+    if self._config.activation.lower() == 'dice':
+      self.activations = [
+          get_activation('dice', is_training=is_training, feat_dim=units)
+          for units in self.hidden_units
+      ]
+    else:
+      self.activation = get_activation(self._config.activation)
     self._last_layer_no_activation = last_layer_no_activation
     self._last_layer_no_batch_norm = last_layer_no_batch_norm
 
@@ -51,6 +57,7 @@ def __call__(self, deep_fea, hidden_layer_feature_output=False):
     if hidden_units_len == 1 and self.hidden_units[0] == 0:
       return deep_fea
 
+    act = self._config.activation.lower()
     hidden_feature_dict = {}
     for i, unit in enumerate(self.hidden_units):
       deep_fea = tf.layers.dense(
@@ -67,8 +74,8 @@ def __call__(self, deep_fea, hidden_layer_feature_output=False):
             trainable=True,
             name='%s/dnn_%d/bn' % (self._name, i))
       if (i + 1 < hidden_units_len) or not self._last_layer_no_activation:
-        deep_fea = self.activation(
-            deep_fea, name='%s/dnn_%d/act' % (self._name, i))
+        act_fn = self.activations[i] if act == 'dice' else self.activation
+        deep_fea = act_fn(deep_fea, name='%s/dnn_%d/act' % (self._name, i))
       if len(self.dropout_ratio) > 0 and self._is_training:
         assert self.dropout_ratio[
             i] < 1, 'invalid dropout_ratio: %.3f' % self.dropout_ratio[i]
diff --git a/easy_rec/python/layers/multihead_cross_attention.py b/easy_rec/python/layers/multihead_cross_attention.py
index bafb7e019..92b2b64df 100644
--- a/easy_rec/python/layers/multihead_cross_attention.py
+++ b/easy_rec/python/layers/multihead_cross_attention.py
@@ -5,10 +5,12 @@
 from __future__ import print_function
 
 import math
+
 import tensorflow as tf
+
 from easy_rec.python.compat.layers import layer_norm as tf_layer_norm
-from easy_rec.python.utils.shape_utils import get_shape_list
 from easy_rec.python.utils.activation import gelu
+from easy_rec.python.utils.shape_utils import get_shape_list
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
diff --git a/easy_rec/python/layers/sequence_encoder.py b/easy_rec/python/layers/sequence_encoder.py
index 0d141c094..e97ed8d09 100644
--- a/easy_rec/python/layers/sequence_encoder.py
+++ b/easy_rec/python/layers/sequence_encoder.py
@@ -7,8 +7,8 @@
 from easy_rec.python.compat import regularizers
 from easy_rec.python.layers import dnn
 from easy_rec.python.layers import multihead_cross_attention
-from easy_rec.python.utils.shape_utils import get_shape_list
 from easy_rec.python.utils.activation import get_activation
+from easy_rec.python.utils.shape_utils import get_shape_list
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -39,7 +39,7 @@ def __call__(self, features, group_name, is_training=True, *args, **kwargs):
       encoder_type = encoder.WhichOneof('encoder').lower()
       if encoder_type == 'bst':
         encoding = self.bst_encoder(seq_features, target_feature, group_name,
-                                    encoder.bst)
+                                    encoder.bst, is_training)
         outputs.append(encoding)
       elif encoder_type == 'din':
         encoding = self.din_encoder(seq_features, target_feature, group_name,
@@ -74,7 +74,7 @@ def din_encoder(self, seq_features, target_feature, group_name, config,
     din_all = tf.concat([queries, keys, queries - keys, queries * keys],
                         axis=-1)
     din_layer = dnn.DNN(
-        config.dnn,
+        config.attention_dnn,
         self._l2_reg,
         group_name + '/din_attention',
         is_training,
@@ -91,10 +91,12 @@ def din_encoder(self, seq_features, target_feature, group_name, config,
     scores = scores / (seq_emb_size**0.5)
     # normalization with softmax is abandoned according to the original paper
     scores = tf.nn.sigmoid(scores)
-    output = tf.squeeze(tf.matmul(scores, keys))
+    output = tf.squeeze(tf.matmul(scores, keys), axis=[1])
+    print('din output shape:', output.shape)
     return output
 
-  def bst_encoder(self, seq_features, target_feature, group_name, config, is_training):
+  def bst_encoder(self, seq_features, target_feature, group_name, config,
+                  is_training):
     if not is_training:
       config.hidden_dropout_prob = 0.0
       config.attention_probs_dropout_prob = 0.0
@@ -158,6 +160,7 @@ def bst_encoder(self, seq_features, target_feature, group_name, config, is_train
         name=group_name + '/bst')
     # attention_fea shape: [batch_size, seq_length, hidden_size]
     out_fea = attention_fea[:, 0, :]  # target feature
+    print('bst output shape:', out_fea.shape)
     return out_fea
 
 
diff --git a/easy_rec/python/layers/uniter.py b/easy_rec/python/layers/uniter.py
index 47ccc678c..3018bad61 100644
--- a/easy_rec/python/layers/uniter.py
+++ b/easy_rec/python/layers/uniter.py
@@ -4,8 +4,8 @@
 
 from easy_rec.python.layers import dnn
 from easy_rec.python.layers import multihead_cross_attention
-from easy_rec.python.utils.shape_utils import get_shape_list
 from easy_rec.python.utils.activation import get_activation
+from easy_rec.python.utils.shape_utils import get_shape_list
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
diff --git a/easy_rec/python/model/collaborative_metric_learning.py b/easy_rec/python/model/collaborative_metric_learning.py
index 7e5d7c008..d785e7141 100644
--- a/easy_rec/python/model/collaborative_metric_learning.py
+++ b/easy_rec/python/model/collaborative_metric_learning.py
@@ -3,12 +3,12 @@
 from easy_rec.python.core.metrics import metric_learning_average_precision_at_k
 from easy_rec.python.core.metrics import metric_learning_recall_at_k
 from easy_rec.python.layers import dnn
-from easy_rec.python.utils.activation import gelu
 from easy_rec.python.layers.common_layers import highway
 from easy_rec.python.loss.circle_loss import circle_loss
 from easy_rec.python.loss.multi_similarity import ms_loss
 from easy_rec.python.model.easy_rec_model import EasyRecModel
 from easy_rec.python.protos.loss_pb2 import LossType
+from easy_rec.python.utils.activation import gelu
 from easy_rec.python.utils.proto_util import copy_obj
 
 from easy_rec.python.protos.collaborative_metric_learning_pb2 import CoMetricLearningI2I as MetricLearningI2IConfig  # NOQA
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index eff1af32a..7815ed0de 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -105,20 +105,23 @@ def build_input_layer(self, model_config, feature_configs):
         is_training=self._is_training)
 
   def get_sequence_encoding(self, group_name=None, is_training=True):
-    if group_name is None:
-      seq_encoding = []
-      for group in self.feature_groups:
-        if len(group.sequence_encoders) == 0:
-          continue
-        encoding = self.get_sequence_encoding(group.group_name,
-                                              self._is_training)
-        if encoding is not None:
-          seq_encoding.append(encoding)
-      if seq_encoding:
-        return tf.concat(seq_encoding, axis=-1)
-      else:
-        return None
-    return self._sequence_encoder(self._feature_dict, group_name, is_training)
+    if group_name is not None:
+      return self._sequence_encoder(self._feature_dict, group_name, is_training)
+
+    seq_encoding = []
+    for group in self.feature_groups:
+      if len(group.sequence_encoders) == 0:
+        continue
+      encoding = self.get_sequence_encoding(group.group_name, self._is_training)
+      if encoding is not None:
+        seq_encoding.append(encoding)
+
+    if len(seq_encoding) > 1:
+      return tf.concat(seq_encoding, axis=-1)
+    elif len(seq_encoding) == 1:
+      return seq_encoding[0]
+    else:
+      return None
 
   @abstractmethod
   def build_predict_graph(self):
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index 482c5241f..814de794e 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -108,5 +108,5 @@ message BSTEncoder {
 
 message DINEncoder {
     // din attention layer
-    required DNN dnn = 1;
+    required DNN attention_dnn = 1;
 }
diff --git a/easy_rec/python/utils/activation.py b/easy_rec/python/utils/activation.py
index 7b5d5248b..39d9011c4 100644
--- a/easy_rec/python/utils/activation.py
+++ b/easy_rec/python/utils/activation.py
@@ -1,59 +1,66 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import tensorflow as tf
 import numpy as np
 import six
-from easy_rec.python.utils.load_class import load_by_path
+import tensorflow as tf
 from tensorflow.python.keras.layers import Layer
+
+from easy_rec.python.utils.load_class import load_by_path
+
 try:
-    from tensorflow.python.keras.layers import BatchNormalization
-except ImportError:
-    BatchNormalization = tf.keras.layers.BatchNormalization
-try:
-    from tensorflow.python.ops.init_ops import Zeros
+  from tensorflow.python.keras.layers import BatchNormalization
 except ImportError:
-    from tensorflow.python.ops.init_ops_v2 import Zeros
+  BatchNormalization = tf.keras.layers.BatchNormalization
 
 
 class Dice(Layer):
-  """The Data Adaptive Activation Function in DIN,which can be viewed as a generalization of PReLu and can adaptively adjust the rectified point according to distribution of input data.
-    Input shape
-      - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
-    Output shape
-      - Same shape as the input.
-    Arguments
-      - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis).
-      - **epsilon** : Small float added to variance to avoid dividing by zero.
-    References
-      - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf)
+  """The Data Adaptive Activation Function in DIN.
+
+  Which can be viewed as a generalization of PReLu, and can adaptively adjust the rectified point
+   according to distribution of input data.
+
+  Input shape
+    - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis)
+     when using this layer as the first layer in a model.
+
+  Output shape
+    - Same shape as the input.
+
+  Arguments
+    - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis).
+    - **epsilon** : Small float added to variance to avoid dividing by zero.
+
+  References
+    - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]
+     Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
+     ACM, 2018: 1059-1068.] (https://arxiv.org/pdf/1706.06978.pdf)
   """
 
-  def __init__(self, axis=-1, epsilon=1e-9, is_training=None, **kwargs):
+  def __init__(self,
+               feat_dim,
+               axis=-1,
+               epsilon=1e-9,
+               is_training=None,
+               **kwargs):
+    super(Dice, self).__init__(**kwargs)
+    self.feat_dim = feat_dim
     self.axis = axis
     self.epsilon = epsilon
     self.is_training = is_training
-    super(Dice, self).__init__(**kwargs)
-
-  def build(self, input_shape):
     self.bn = BatchNormalization(
-      axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
-    self.alphas = self.add_weight(shape=(input_shape[-1],), initializer=Zeros(
-    ), dtype=tf.float32, name='dice_alpha')  # name='alpha_'+self.name
-    super(Dice, self).build(input_shape)  # Be sure to call this somewhere!
-    self.uses_learning_phase = True
+        axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
+    self.alphas = tf.Variable(tf.zeros([feat_dim]), dtype=tf.float32)
 
   def call(self, inputs, **kwargs):
     inputs_normed = self.bn(inputs, training=self.is_training)
-    # tf.layers.batch_normalization(
-    # inputs, axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
     x_p = tf.sigmoid(inputs_normed)
     return self.alphas * (1.0 - x_p) * inputs + x_p * inputs
 
   def compute_output_shape(self, input_shape):
     return input_shape
 
-  def get_config(self, ):
+  def get_config(self,):
     config = {'axis': self.axis, 'epsilon': self.epsilon}
     base_config = super(Dice, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -108,8 +115,10 @@ def get_activation(activation_string, **kwargs):
   elif act == 'leaky_relu':
     return tf.nn.leaky_relu
   elif act in ('prelu', 'PRelu'):
+    if len(kwargs) == 0:
+      return tf.nn.leaky_relu
     return tf.keras.layers.PReLU(**kwargs)
-  elif act in ("dice", "Dice"):
+  elif act in ('dice', 'Dice'):
     return Dice(**kwargs)
   elif act == 'elu':
     return tf.nn.elu

From 778e70eac41f93d61a5f3324664204ba57343d8b Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Sun, 12 Mar 2023 20:46:59 +0800
Subject: [PATCH 04/54] [feat]: add sequence encoding module

---
 easy_rec/python/layers/bst.py              |  90 +++++++++++
 easy_rec/python/layers/din.py              |  53 +++++++
 easy_rec/python/layers/dnn.py              |  16 +-
 easy_rec/python/layers/sequence_encoder.py | 169 ++++++---------------
 easy_rec/python/model/easy_rec_model.py    |  20 ++-
 easy_rec/python/protos/layer.proto         |   7 +-
 easy_rec/python/utils/activation.py        |  27 ++--
 7 files changed, 234 insertions(+), 148 deletions(-)
 create mode 100644 easy_rec/python/layers/bst.py
 create mode 100644 easy_rec/python/layers/din.py

diff --git a/easy_rec/python/layers/bst.py b/easy_rec/python/layers/bst.py
new file mode 100644
index 000000000..2bdb20c9d
--- /dev/null
+++ b/easy_rec/python/layers/bst.py
@@ -0,0 +1,90 @@
+# -*- encoding: utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+from tensorflow.python.keras.layers import Layer
+
+from easy_rec.python.layers import multihead_cross_attention
+from easy_rec.python.utils.activation import get_activation
+from easy_rec.python.utils.shape_utils import get_shape_list
+
+
+class BST(Layer):
+
+  def __init__(self, config, l2_reg, name='din', **kwargs):
+    super(BST, self).__init__(name=name, **kwargs)
+    self.l2_reg = l2_reg
+    self.config = config
+
+  def call(self, inputs, training=None, **kwargs):
+    seq_features, target_feature = inputs
+    if not training:
+      self.config.hidden_dropout_prob = 0.0
+      self.config.attention_probs_dropout_prob = 0.0
+
+    seq_embeds = [seq_fea for seq_fea, _ in seq_features]
+
+    max_position = self.config.max_position_embeddings
+    # max_seq_len: the max sequence length in current mini-batch, all sequences are padded to this length
+    batch_size, max_seq_len, _ = get_shape_list(seq_features[0][0], 3)
+    valid_len = tf.assert_less_equal(
+        max_seq_len,
+        max_position,
+        message='sequence length is greater than `max_position_embeddings`:' +
+        str(max_position) + ' in feature group:' + self.name)
+    with tf.control_dependencies([valid_len]):
+      # seq_input: [batch_size, seq_len, embed_size]
+      seq_input = tf.concat(seq_embeds, axis=-1)
+
+    # seq_len: [batch_size, 1], the true length of each sequence
+    seq_len = seq_features[0][1]
+    seq_embed_size = seq_input.shape.as_list()[-1]
+    if target_feature is not None:
+      target_size = target_feature.shape.as_list()[-1]
+      assert seq_embed_size == target_size, 'the embedding size of sequence and target item is not equal' \
+                                            ' in feature group:' + self.name
+      # target_feature: [batch_size, 1, embed_size]
+      target_feature = tf.expand_dims(target_feature, 1)
+      # seq_input: [batch_size, seq_len+1, embed_size]
+      seq_input = tf.concat([target_feature, seq_input], axis=1)
+      max_seq_len += 1
+      seq_len += 1
+      max_position += 1
+
+    seq_input = tf.layers.dense(
+        seq_input,
+        self.config.hidden_size,
+        activation=tf.nn.leaky_relu,
+        kernel_regularizer=self.l2_reg)
+
+    seq_fea = multihead_cross_attention.embedding_postprocessor(
+        seq_input,
+        position_embedding_name=self.name + '/position_embeddings',
+        max_position_embeddings=max_position)
+    seq_mask = tf.map_fn(
+        fn=lambda t: dynamic_mask(t, max_seq_len), elems=tf.to_int32(seq_len))
+    attention_mask = multihead_cross_attention.create_attention_mask_from_input_mask(
+        from_tensor=seq_fea, to_mask=seq_mask)
+
+    hidden_act = get_activation(self.config.hidden_act)
+    attention_fea = multihead_cross_attention.transformer_encoder(
+        seq_fea,
+        hidden_size=self.config.hidden_size,
+        num_hidden_layers=self.config.num_hidden_layers,
+        num_attention_heads=self.config.num_attention_heads,
+        attention_mask=attention_mask,
+        intermediate_size=self.config.intermediate_size,
+        intermediate_act_fn=hidden_act,
+        hidden_dropout_prob=self.config.hidden_dropout_prob,
+        attention_probs_dropout_prob=self.config.attention_probs_dropout_prob,
+        initializer_range=self.config.initializer_range,
+        name=self.name + '/bst')
+    # attention_fea shape: [batch_size, seq_length, hidden_size]
+    out_fea = attention_fea[:, 0, :]  # target feature
+    print('bst output shape:', out_fea.shape)
+    return out_fea
+
+
+def dynamic_mask(x, max_len):
+  ones = tf.ones(shape=tf.stack([x]), dtype=tf.int32)
+  zeros = tf.zeros(shape=tf.stack([max_len - x]), dtype=tf.int32)
+  return tf.concat([ones, zeros], axis=0)
diff --git a/easy_rec/python/layers/din.py b/easy_rec/python/layers/din.py
new file mode 100644
index 000000000..3b6f42df5
--- /dev/null
+++ b/easy_rec/python/layers/din.py
@@ -0,0 +1,53 @@
+# -*- encoding: utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+from tensorflow.python.keras.layers import Layer
+
+from easy_rec.python.layers import dnn
+from easy_rec.python.utils.shape_utils import get_shape_list
+
+
+class DIN(Layer):
+
+  def __init__(self, config, l2_reg, name='din', **kwargs):
+    super(DIN, self).__init__(name=name, **kwargs)
+    self.l2_reg = l2_reg
+    self.config = config
+
+  def call(self, inputs, training=None, **kwargs):
+    seq_features, target_feature = inputs
+    seq_input = [seq_fea for seq_fea, _ in seq_features]
+    keys = tf.concat(seq_input, axis=-1)
+
+    target_emb_size = target_feature.shape.as_list()[-1]
+    seq_emb_size = keys.shape.as_list()[-1]
+    assert target_emb_size == seq_emb_size, 'the embedding size of sequence and target item is not equal' \
+                                            ' in feature group:' + self.name
+
+    batch_size, max_seq_len, _ = get_shape_list(keys, 3)
+    queries = tf.tile(tf.expand_dims(target_feature, 1), [1, max_seq_len, 1])
+    din_all = tf.concat([queries, keys, queries - keys, queries * keys],
+                        axis=-1)
+    din_layer = dnn.DNN(
+        self.config.attention_dnn,
+        self.l2_reg,
+        self.name + '/din_attention',
+        training,
+        last_layer_no_activation=True,
+        last_layer_no_batch_norm=True)
+    output = din_layer(din_all)  # [B, L, 1]
+    scores = tf.transpose(output, [0, 2, 1])  # [B, 1, L]
+
+    seq_len = seq_features[0][1]
+    seq_mask = tf.sequence_mask(seq_len, max_seq_len, dtype=tf.bool)
+    seq_mask = tf.expand_dims(seq_mask, 1)
+    paddings = tf.ones_like(scores) * (-2**32 + 1)
+    scores = tf.where(seq_mask, scores, paddings)  # [B, 1, L]
+    scores = scores / (seq_emb_size**0.5)
+    # normalization with softmax is abandoned according to the original paper
+    scores = tf.nn.sigmoid(scores)
+    output = tf.squeeze(tf.matmul(scores, keys), axis=[1])
+    if self.config.need_target_feature:
+      output = tf.concat([output, target_feature], axis=-1)
+    print('din output shape:', output.shape)
+    return output
diff --git a/easy_rec/python/layers/dnn.py b/easy_rec/python/layers/dnn.py
index 3365f47f0..6016d6233 100644
--- a/easy_rec/python/layers/dnn.py
+++ b/easy_rec/python/layers/dnn.py
@@ -34,13 +34,10 @@ def __init__(self,
     self._name = name
     self._is_training = is_training
     logging.info('dnn activation function = %s' % self._config.activation)
-    if self._config.activation.lower() == 'dice':
-      self.activations = [
-          get_activation('dice', is_training=is_training, feat_dim=units)
-          for units in self.hidden_units
-      ]
-    else:
-      self.activation = get_activation(self._config.activation)
+    self.activations = [
+        get_activation(self._config.activation, is_training=is_training)
+        for _ in self.hidden_units
+    ]
     self._last_layer_no_activation = last_layer_no_activation
     self._last_layer_no_batch_norm = last_layer_no_batch_norm
 
@@ -57,7 +54,6 @@ def __call__(self, deep_fea, hidden_layer_feature_output=False):
     if hidden_units_len == 1 and self.hidden_units[0] == 0:
       return deep_fea
 
-    act = self._config.activation.lower()
     hidden_feature_dict = {}
     for i, unit in enumerate(self.hidden_units):
       deep_fea = tf.layers.dense(
@@ -74,8 +70,8 @@ def __call__(self, deep_fea, hidden_layer_feature_output=False):
             trainable=True,
             name='%s/dnn_%d/bn' % (self._name, i))
       if (i + 1 < hidden_units_len) or not self._last_layer_no_activation:
-        act_fn = self.activations[i] if act == 'dice' else self.activation
-        deep_fea = act_fn(deep_fea, name='%s/dnn_%d/act' % (self._name, i))
+        deep_fea = self.activations[i](
+            deep_fea, name='%s/dnn_%d/act' % (self._name, i))
       if len(self.dropout_ratio) > 0 and self._is_training:
         assert self.dropout_ratio[
             i] < 1, 'invalid dropout_ratio: %.3f' % self.dropout_ratio[i]
diff --git a/easy_rec/python/layers/sequence_encoder.py b/easy_rec/python/layers/sequence_encoder.py
index e97ed8d09..80c90eafa 100644
--- a/easy_rec/python/layers/sequence_encoder.py
+++ b/easy_rec/python/layers/sequence_encoder.py
@@ -4,11 +4,9 @@
 
 import tensorflow as tf
 
-from easy_rec.python.compat import regularizers
-from easy_rec.python.layers import dnn
-from easy_rec.python.layers import multihead_cross_attention
-from easy_rec.python.utils.activation import get_activation
-from easy_rec.python.utils.shape_utils import get_shape_list
+from easy_rec.python.layers.bst import BST
+from easy_rec.python.layers.din import DIN
+from easy_rec.python.protos.feature_config_pb2 import FeatureConfig
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -16,13 +14,51 @@
 
 class SequenceEncoder(object):
 
-  def __init__(self, input_layer, feature_groups_config, emb_reg, l2_reg):
+  def __init__(self, input_layer, feature_configs, feature_groups_config,
+               l2_reg):
     self._input_layer = input_layer
     self._feature_groups_config = {
         x.group_name: x for x in feature_groups_config
     }
-    self._emb_reg = emb_reg
     self._l2_reg = l2_reg
+    self._feature_config_by_name = {
+        x.feature_name if x.HasField('feature_name') else x.input_names[0]: x
+        for x in feature_configs
+    }
+
+    for name, group in self._feature_groups_config.items():
+      if len(group.sequence_encoders) == 0:
+        continue
+      check_share_emb = False
+      for encoder in group.sequence_encoders:
+        if encoder.force_share_embeddings:
+          check_share_emb = True
+          break
+      if not check_share_emb:
+        continue
+      if not self.check_share_embedding(group):
+        raise ValueError(
+            'sequence feature group `%s` check share embedding failed, '
+            'you should add `embedding_name` to feature config' % name)
+
+  def check_share_embedding(self, feature_group):
+    seq_emb_names = set()
+    target_emb_names = set()
+    for feature in feature_group.feature_names:
+      conf = self._feature_config_by_name[feature]
+      if not conf.HasField('embedding_name'):
+        return False
+      if conf.feature_type == FeatureConfig.FeatureType.SequenceFeature:
+        seq_emb_names.add(conf.embedding_name)
+      else:
+        target_emb_names.add(conf.embedding_name)
+
+    if seq_emb_names != target_emb_names:
+      tf.logging.error(
+          'sequence share embedding names: %s, target share embedding names: %s'
+          % (','.join(seq_emb_names), ','.join(target_emb_names)))
+      return False
+    return True
 
   def __call__(self, features, group_name, is_training=True, *args, **kwargs):
     group_config = self._feature_groups_config[group_name]
@@ -38,12 +74,12 @@ def __call__(self, features, group_name, is_training=True, *args, **kwargs):
     for encoder in group_config.sequence_encoders:
       encoder_type = encoder.WhichOneof('encoder').lower()
       if encoder_type == 'bst':
-        encoding = self.bst_encoder(seq_features, target_feature, group_name,
-                                    encoder.bst, is_training)
+        bst = BST(encoder.bst, self._l2_reg, name=group_name)
+        encoding = bst([seq_features, target_feature], is_training)
         outputs.append(encoding)
       elif encoder_type == 'din':
-        encoding = self.din_encoder(seq_features, target_feature, group_name,
-                                    encoder.din, is_training)
+        din = DIN(encoder.din, self._l2_reg, name=group_name)
+        encoding = din([seq_features, target_feature], is_training)
         outputs.append(encoding)
       else:
         assert False, 'unsupported sequence encode type: ' + encoder_type
@@ -57,114 +93,3 @@ def __call__(self, features, group_name, is_training=True, *args, **kwargs):
       return outputs[0]
 
     return tf.concat(outputs, axis=-1)
-
-  def din_encoder(self, seq_features, target_feature, group_name, config,
-                  is_training):
-    seq_input = [seq_fea for seq_fea, _ in seq_features]
-    regularizers.apply_regularization(self._emb_reg, weights_list=seq_input)
-    keys = tf.concat(seq_input, axis=-1)
-
-    target_emb_size = target_feature.shape.as_list()[-1]
-    seq_emb_size = keys.shape.as_list()[-1]
-    assert target_emb_size == seq_emb_size, 'the embedding size of sequence and target item is not equal' \
-                                            ' in feature group:' + group_name
-
-    batch_size, max_seq_len, _ = get_shape_list(keys, 3)
-    queries = tf.tile(tf.expand_dims(target_feature, 1), [1, max_seq_len, 1])
-    din_all = tf.concat([queries, keys, queries - keys, queries * keys],
-                        axis=-1)
-    din_layer = dnn.DNN(
-        config.attention_dnn,
-        self._l2_reg,
-        group_name + '/din_attention',
-        is_training,
-        last_layer_no_activation=True,
-        last_layer_no_batch_norm=True)
-    output = din_layer(din_all)  # [B, L, 1]
-    scores = tf.transpose(output, [0, 2, 1])  # [B, 1, L]
-
-    seq_len = seq_features[0][1]
-    seq_mask = tf.sequence_mask(seq_len, max_seq_len, dtype=tf.bool)
-    seq_mask = tf.expand_dims(seq_mask, 1)
-    paddings = tf.ones_like(scores) * (-2**32 + 1)
-    scores = tf.where(seq_mask, scores, paddings)  # [B, 1, L]
-    scores = scores / (seq_emb_size**0.5)
-    # normalization with softmax is abandoned according to the original paper
-    scores = tf.nn.sigmoid(scores)
-    output = tf.squeeze(tf.matmul(scores, keys), axis=[1])
-    print('din output shape:', output.shape)
-    return output
-
-  def bst_encoder(self, seq_features, target_feature, group_name, config,
-                  is_training):
-    if not is_training:
-      config.hidden_dropout_prob = 0.0
-      config.attention_probs_dropout_prob = 0.0
-
-    seq_embeds = [seq_fea for seq_fea, _ in seq_features]
-    regularizers.apply_regularization(self._emb_reg, weights_list=seq_embeds)
-
-    max_position = config.max_position_embeddings
-    batch_size, max_seq_len, _ = get_shape_list(seq_features[0][0], 3)
-    valid_len = tf.assert_less_equal(
-        max_seq_len,
-        max_position,
-        message='sequence length is greater than `max_position_embeddings`:' +
-        str(max_position) + ' in feature group:' + group_name)
-    with tf.control_dependencies([valid_len]):
-      # seq_input: [batch_size, seq_len, embed_size]
-      seq_input = tf.concat(seq_embeds, axis=-1)
-
-    # seq_len: [batch_size, ], 假设每个sequence feature的length都是相同的
-    seq_len = seq_features[0][1]
-    seq_embed_size = seq_input.shape.as_list()[-1]
-    if target_feature is not None:
-      target_size = target_feature.shape.as_list()[-1]
-      assert seq_embed_size == target_size, 'the embedding size of sequence and target item is not equal' \
-                                            ' in feature group:' + group_name
-      # target_feature: [batch_size, 1, embed_size]
-      target_feature = tf.expand_dims(target_feature, 1)
-      # seq_input: [batch_size, seq_len+1, embed_size]
-      seq_input = tf.concat([target_feature, seq_input], axis=1)
-      max_seq_len += 1
-      seq_len += 1
-
-    if seq_embed_size != config.hidden_size:
-      seq_input = tf.layers.dense(
-          seq_input,
-          config.hidden_size,
-          activation=tf.nn.relu,
-          kernel_regularizer=self._l2_reg)
-
-    seq_fea = multihead_cross_attention.embedding_postprocessor(
-        seq_input,
-        position_embedding_name=group_name + '_position_embeddings',
-        max_position_embeddings=max_position)
-    seq_mask = tf.map_fn(
-        fn=lambda t: dynamic_mask(t, max_seq_len), elems=tf.to_int32(seq_len))
-    attention_mask = multihead_cross_attention.create_attention_mask_from_input_mask(
-        from_tensor=seq_fea, to_mask=seq_mask)
-
-    hidden_act = get_activation(config.hidden_act)
-    attention_fea = multihead_cross_attention.transformer_encoder(
-        seq_fea,
-        hidden_size=config.hidden_size,
-        num_hidden_layers=config.num_hidden_layers,
-        num_attention_heads=config.num_attention_heads,
-        attention_mask=attention_mask,
-        intermediate_size=config.intermediate_size,
-        intermediate_act_fn=hidden_act,
-        hidden_dropout_prob=config.hidden_dropout_prob,
-        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
-        initializer_range=config.initializer_range,
-        name=group_name + '/bst')
-    # attention_fea shape: [batch_size, seq_length, hidden_size]
-    out_fea = attention_fea[:, 0, :]  # target feature
-    print('bst output shape:', out_fea.shape)
-    return out_fea
-
-
-def dynamic_mask(x, max_len):
-  ones = tf.ones(shape=tf.stack([x]), dtype=tf.int32)
-  zeros = tf.zeros(shape=tf.stack([max_len - x]), dtype=tf.int32)
-  return tf.concat([ones, zeros], axis=0)
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index 7815ed0de..e28660c45 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -61,9 +61,10 @@ def __init__(self,
     if constant.SAMPLE_WEIGHT in features:
       self._sample_weight = features[constant.SAMPLE_WEIGHT]
 
-    self._sequence_encoder = SequenceEncoder(self._input_layer,
+    self._sequence_encoder = SequenceEncoder(self._input_layer, feature_configs,
                                              model_config.feature_groups,
-                                             self._emb_reg, self._l2_reg)
+                                             self._l2_reg)
+    self._sequence_encoding_by_group_name = {}
 
   @property
   def embedding_regularization(self):
@@ -106,13 +107,24 @@ def build_input_layer(self, model_config, feature_configs):
 
   def get_sequence_encoding(self, group_name=None, is_training=True):
     if group_name is not None:
-      return self._sequence_encoder(self._feature_dict, group_name, is_training)
+      if group_name in self._sequence_encoding_by_group_name:
+        return self._sequence_encoding_by_group_name[group_name]
+      encoding = self._sequence_encoder(self._feature_dict, group_name,
+                                        is_training)
+      self._sequence_encoding_by_group_name[group_name] = encoding
+      return encoding
 
     seq_encoding = []
     for group in self.feature_groups:
       if len(group.sequence_encoders) == 0:
         continue
-      encoding = self.get_sequence_encoding(group.group_name, self._is_training)
+      group_name = group.group_name
+      if group_name in self._sequence_encoding_by_group_name:
+        encoding = self._sequence_encoding_by_group_name[group_name]
+      else:
+        encoding = self._sequence_encoder(self._feature_dict, group_name,
+                                          is_training)
+        self._sequence_encoding_by_group_name[group_name] = encoding
       if encoding is not None:
         seq_encoding.append(encoding)
 
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index 814de794e..a5917a38d 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -78,9 +78,10 @@ message UniterTower {
 message SequenceEncoder {
     // encoder parameters
     oneof encoder {
-        BSTEncoder bst = 1;
-        DINEncoder din = 2;
+        BSTEncoder bst = 101;
+        DINEncoder din = 102;
     }
+    required bool force_share_embeddings = 1 [default = true];
 }
 
 message BSTEncoder {
@@ -109,4 +110,6 @@ message BSTEncoder {
 message DINEncoder {
     // din attention layer
     required DNN attention_dnn = 1;
+    // whether to keep target item feature
+    required bool need_target_feature = 2 [default = true];
 }
diff --git a/easy_rec/python/utils/activation.py b/easy_rec/python/utils/activation.py
index 39d9011c4..25df2a486 100644
--- a/easy_rec/python/utils/activation.py
+++ b/easy_rec/python/utils/activation.py
@@ -13,6 +13,11 @@
 except ImportError:
   BatchNormalization = tf.keras.layers.BatchNormalization
 
+try:
+  from tensorflow.python.ops.init_ops import Zeros
+except ImportError:
+  from tensorflow.python.ops.init_ops_v2 import Zeros
+
 
 class Dice(Layer):
   """The Data Adaptive Activation Function in DIN.
@@ -37,20 +42,22 @@ class Dice(Layer):
      ACM, 2018: 1059-1068.] (https://arxiv.org/pdf/1706.06978.pdf)
   """
 
-  def __init__(self,
-               feat_dim,
-               axis=-1,
-               epsilon=1e-9,
-               is_training=None,
-               **kwargs):
+  def __init__(self, axis=-1, epsilon=1e-9, is_training=None, **kwargs):
     super(Dice, self).__init__(**kwargs)
-    self.feat_dim = feat_dim
     self.axis = axis
     self.epsilon = epsilon
     self.is_training = is_training
+
+  def build(self, input_shape):
+    super(Dice, self).build(input_shape)  # Be sure to call this somewhere!
     self.bn = BatchNormalization(
         axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
-    self.alphas = tf.Variable(tf.zeros([feat_dim]), dtype=tf.float32)
+    self.alphas = self.add_weight(
+        shape=(input_shape[-1],),
+        initializer=Zeros(),
+        dtype=tf.float32,
+        name='dice_alpha')  # name='alpha_'+self.name
+    self.uses_learning_phase = True
 
   def call(self, inputs, **kwargs):
     inputs_normed = self.bn(inputs, training=self.is_training)
@@ -114,11 +121,11 @@ def get_activation(activation_string, **kwargs):
     return gelu
   elif act == 'leaky_relu':
     return tf.nn.leaky_relu
-  elif act in ('prelu', 'PRelu'):
+  elif act == 'prelu':
     if len(kwargs) == 0:
       return tf.nn.leaky_relu
     return tf.keras.layers.PReLU(**kwargs)
-  elif act in ('dice', 'Dice'):
+  elif act == 'dice':
     return Dice(**kwargs)
   elif act == 'elu':
     return tf.nn.elu

From 0254902ff454e34ccc9b88db3e40550f25fa0335 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Sat, 18 Mar 2023 19:49:41 +0800
Subject: [PATCH 05/54] [feat]: add pairwise logistic loss

---
 easy_rec/python/builders/loss_builder.py      |  36 ++++-
 .../python/compat/weight_decay_optimizers.py  |   7 +-
 easy_rec/python/layers/bst.py                 |  21 +--
 easy_rec/python/layers/din.py                 |  10 +-
 easy_rec/python/layers/dnn.py                 |   5 +-
 easy_rec/python/loss/focal_loss.py            |  62 ++++++++
 easy_rec/python/loss/pairwise_loss.py         | 136 ++++++++++++++++--
 easy_rec/python/model/multi_task_model.py     |   4 +-
 easy_rec/python/model/rank_model.py           |  38 +++--
 easy_rec/python/protos/loss.proto             |  29 ++++
 easy_rec/python/protos/tower.proto            |   6 +-
 easy_rec/python/utils/activation.py           |  36 +++--
 setup.cfg                                     |   2 +-
 13 files changed, 338 insertions(+), 54 deletions(-)
 create mode 100644 easy_rec/python/loss/focal_loss.py

diff --git a/easy_rec/python/builders/loss_builder.py b/easy_rec/python/builders/loss_builder.py
index a26372605..cf2751965 100644
--- a/easy_rec/python/builders/loss_builder.py
+++ b/easy_rec/python/builders/loss_builder.py
@@ -4,6 +4,9 @@
 
 import tensorflow as tf
 
+from easy_rec.python.loss.focal_loss import sigmoid_focal_loss_with_logits
+from easy_rec.python.loss.pairwise_loss import pairwise_focal_loss
+from easy_rec.python.loss.pairwise_loss import pairwise_logistic_loss
 from easy_rec.python.loss.pairwise_loss import pairwise_loss
 from easy_rec.python.protos.loss_pb2 import LossType
 
@@ -36,7 +39,28 @@ def build(loss_type,
     return tf.losses.mean_squared_error(
         labels=label, predictions=pred, weights=loss_weight, **kwargs)
   elif loss_type == LossType.PAIR_WISE_LOSS:
-    return pairwise_loss(label, pred)
+    session = kwargs.get('session_ids', None)
+    margin = 0 if loss_param is None else loss_param.margin
+    return pairwise_loss(
+        label, pred, session_ids=session, margin=margin, weights=loss_weight)
+  elif loss_type == LossType.PAIRWISE_LOGISTIC_LOSS:
+    session = kwargs.get('session_ids', None)
+    temp = 1.0 if loss_param is None else loss_param.temperature
+    return pairwise_logistic_loss(
+        label, pred, session_ids=session, temperature=temp, weights=loss_weight)
+  elif loss_type == LossType.PAIRWISE_FOCAL_LOSS:
+    session = kwargs.get('session_ids', None)
+    if loss_param is None:
+      return pairwise_focal_loss(
+          label, pred, session_ids=session, weights=loss_weight)
+    return pairwise_focal_loss(
+        label,
+        pred,
+        session_ids=session,
+        gamma=loss_param.gamma,
+        alpha=loss_param.alpha if loss_param.HasField('alpha') else None,
+        margin=loss_param.margin,
+        weights=loss_weight)
   elif loss_type == LossType.F1_REWEIGHTED_LOSS:
     f1_beta_square = 1.0 if loss_param is None else loss_param.f1_beta_square
     label_smoothing = 0 if loss_param is None else loss_param.label_smoothing
@@ -46,6 +70,16 @@ def build(loss_type,
         f1_beta_square,
         weights=loss_weight,
         label_smoothing=label_smoothing)
+  elif loss_type == LossType.BINARY_FOCAL_LOSS:
+    if loss_param is None:
+      return sigmoid_focal_loss_with_logits(
+          label, pred, sample_weights=loss_weight)
+    gamma = loss_param.gamma
+    alpha = None
+    if loss_param.HasField('alpha'):
+      alpha = loss_param.alpha
+    return sigmoid_focal_loss_with_logits(
+        label, pred, gamma=gamma, alpha=alpha, sample_weights=loss_weight)
   else:
     raise ValueError('unsupported loss type: %s' % LossType.Name(loss_type))
 
diff --git a/easy_rec/python/compat/weight_decay_optimizers.py b/easy_rec/python/compat/weight_decay_optimizers.py
index d29dce5bb..26eb9754f 100755
--- a/easy_rec/python/compat/weight_decay_optimizers.py
+++ b/easy_rec/python/compat/weight_decay_optimizers.py
@@ -411,10 +411,12 @@ def __init__(self,
 
 
 try:
-  from tensorflow.python.training import AdamAsyncOptimizer
+  # from tensorflow.python.training import AdamAsyncOptimizer
+  import tensorflow as tf
 
   @tf_export('contrib.opt.AdamAsyncWOptimizer')
-  class AdamAsyncWOptimizer(DecoupledWeightDecayExtension, AdamAsyncOptimizer):
+  class AdamAsyncWOptimizer(DecoupledWeightDecayExtension,
+                            tf.train.AdamAsyncOptimizer):
     """Optimizer that implements the Adam algorithm with weight decay.
 
     This is an implementation of the AdamW optimizer described in ["Fixing
@@ -472,4 +474,5 @@ def __init__(self,
           use_locking=use_locking,
           name=name)
 except ImportError:
+  print('import AdamAsyncOptimizer failed')
   pass
diff --git a/easy_rec/python/layers/bst.py b/easy_rec/python/layers/bst.py
index 2bdb20c9d..87e12770c 100644
--- a/easy_rec/python/layers/bst.py
+++ b/easy_rec/python/layers/bst.py
@@ -1,21 +1,23 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import tensorflow as tf
-from tensorflow.python.keras.layers import Layer
 
 from easy_rec.python.layers import multihead_cross_attention
 from easy_rec.python.utils.activation import get_activation
 from easy_rec.python.utils.shape_utils import get_shape_list
 
+# from tensorflow.python.keras.layers import Layer
 
-class BST(Layer):
+
+class BST(object):
 
   def __init__(self, config, l2_reg, name='din', **kwargs):
-    super(BST, self).__init__(name=name, **kwargs)
+    # super(BST, self).__init__(name=name, **kwargs)
+    self.name = name
     self.l2_reg = l2_reg
     self.config = config
 
-  def call(self, inputs, training=None, **kwargs):
+  def __call__(self, inputs, training=None, **kwargs):
     seq_features, target_feature = inputs
     if not training:
       self.config.hidden_dropout_prob = 0.0
@@ -50,11 +52,12 @@ def call(self, inputs, training=None, **kwargs):
       seq_len += 1
       max_position += 1
 
-    seq_input = tf.layers.dense(
-        seq_input,
-        self.config.hidden_size,
-        activation=tf.nn.leaky_relu,
-        kernel_regularizer=self.l2_reg)
+    if seq_embed_size != self.config.hidden_size:
+      seq_input = tf.layers.dense(
+          seq_input,
+          self.config.hidden_size,
+          activation=tf.nn.relu,
+          kernel_regularizer=self.l2_reg)
 
     seq_fea = multihead_cross_attention.embedding_postprocessor(
         seq_input,
diff --git a/easy_rec/python/layers/din.py b/easy_rec/python/layers/din.py
index 3b6f42df5..60d106fe3 100644
--- a/easy_rec/python/layers/din.py
+++ b/easy_rec/python/layers/din.py
@@ -1,20 +1,22 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import tensorflow as tf
-from tensorflow.python.keras.layers import Layer
 
 from easy_rec.python.layers import dnn
 from easy_rec.python.utils.shape_utils import get_shape_list
 
+# from tensorflow.python.keras.layers import Layer
 
-class DIN(Layer):
+
+class DIN(object):
 
   def __init__(self, config, l2_reg, name='din', **kwargs):
-    super(DIN, self).__init__(name=name, **kwargs)
+    # super(DIN, self).__init__(name=name, **kwargs)
+    self.name = name
     self.l2_reg = l2_reg
     self.config = config
 
-  def call(self, inputs, training=None, **kwargs):
+  def __call__(self, inputs, training=None, **kwargs):
     seq_features, target_feature = inputs
     seq_input = [seq_fea for seq_fea, _ in seq_features]
     keys = tf.concat(seq_input, axis=-1)
diff --git a/easy_rec/python/layers/dnn.py b/easy_rec/python/layers/dnn.py
index 6016d6233..74e355e82 100644
--- a/easy_rec/python/layers/dnn.py
+++ b/easy_rec/python/layers/dnn.py
@@ -35,8 +35,9 @@ def __init__(self,
     self._is_training = is_training
     logging.info('dnn activation function = %s' % self._config.activation)
     self.activations = [
-        get_activation(self._config.activation, is_training=is_training)
-        for _ in self.hidden_units
+        get_activation(
+            self._config.activation, is_training=is_training, feat_dim=units)
+        for units in self.hidden_units
     ]
     self._last_layer_no_activation = last_layer_no_activation
     self._last_layer_no_batch_norm = last_layer_no_batch_norm
diff --git a/easy_rec/python/loss/focal_loss.py b/easy_rec/python/loss/focal_loss.py
new file mode 100644
index 000000000..d596b7938
--- /dev/null
+++ b/easy_rec/python/loss/focal_loss.py
@@ -0,0 +1,62 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+
+import tensorflow as tf
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+def sigmoid_focal_loss_with_logits(labels,
+                                   logits,
+                                   gamma=2.0,
+                                   alpha=None,
+                                   sample_weights=None):
+  """Implements the focal loss function.
+
+  Focal loss was first introduced in the RetinaNet paper
+  (https://arxiv.org/pdf/1708.02002.pdf). Focal loss is extremely useful for
+  classification when you have highly imbalanced classes. It down-weights
+  well-classified examples and focuses on hard examples. The loss value is
+  much high for a sample which is misclassified by the classifier as compared
+  to the loss value corresponding to a well-classified example. One of the
+  best use-cases of focal loss is its usage in object detection where the
+  imbalance between the background class and other classes is extremely high.
+
+  Args
+      labels: true targets tensor.
+      logits: predictions tensor.
+      alpha: balancing factor.
+      gamma: modulating factor.
+
+  Returns:
+      Weighted loss float `Tensor`. If `reduction` is `NONE`,this has the
+      same shape as `y_true`; otherwise, it is scalar.
+
+  Raises:
+      ValueError: If the shape of `sample_weight` is invalid or value of
+        `gamma` is less than zero
+  """
+  if gamma and gamma < 0:
+    raise ValueError('Value of gamma should be greater than or equal to zero')
+  logging.info('[focal_loss] gamma: {}, alpha: {}'.format(gamma, alpha))
+
+  y_true = tf.cast(labels, logits.dtype)
+
+  # convert the predictions into probabilities
+  y_pred = tf.nn.sigmoid(logits)
+  p_t = (y_true * y_pred) + ((1 - y_true) * (1 - y_pred))
+  weights = tf.pow((1 - p_t), gamma)
+
+  if alpha is not None:
+    alpha_factor = y_true * alpha + ((1 - alpha) * (1 - y_true))
+    weights *= alpha_factor
+
+  if sample_weights is not None:
+    if tf.is_numeric_tensor(sample_weights):
+      weights *= tf.cast(sample_weights, tf.float32)
+    else:
+      weights *= sample_weights
+
+  return tf.losses.sigmoid_cross_entropy(y_true, logits, weights=weights)
diff --git a/easy_rec/python/loss/pairwise_loss.py b/easy_rec/python/loss/pairwise_loss.py
index 9e16e3bdb..d2af10cb8 100644
--- a/easy_rec/python/loss/pairwise_loss.py
+++ b/easy_rec/python/loss/pairwise_loss.py
@@ -1,27 +1,147 @@
-# coding=utf-8
+# -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import logging
 
 import tensorflow as tf
+from focal_loss import sigmoid_focal_loss_with_logits
+from tensorflow.python.ops.losses.losses_impl import compute_weighted_loss
+
+from easy_rec.python.utils.shape_utils import get_shape_list
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
 
 
-def pairwise_loss(labels, logits):
-  pairwise_logits = tf.expand_dims(logits, -1) - tf.expand_dims(logits, 0)
-  logging.info('[pairwise_loss] pairwise logits: {}'.format(pairwise_logits))
+def pairwise_loss(labels, logits, session_ids=None, margin=0, weights=1.0):
+  """Pairwise loss.  Also see `pairwise_logistic_loss` below.
 
+  Args:
+    labels: a `Tensor` with shape [batch_size]. e.g. click or not click in the session.
+    logits: a `Tensor` with shape [batch_size]. e.g. the value of last neuron before activation.
+    session_ids: a `Tensor` with shape [batch_size]. Session ids of each sample, used to max GAUC metric. e.g. user_id
+    margin: the margin between positive and negative sample pair
+    weights: sample weights
+  """
+  logging.info('[pairwise_loss] margin: {}'.format(margin))
+  pairwise_logits = tf.math.subtract(
+      tf.expand_dims(logits, -1), tf.expand_dims(logits, 0)) - margin
   pairwise_mask = tf.greater(
       tf.expand_dims(labels, -1) - tf.expand_dims(labels, 0), 0)
-  logging.info('[pairwise_loss] mask: {}'.format(pairwise_mask))
+  if session_ids is not None:
+    logging.info('[pairwise_loss] use session ids')
+    group_equal = tf.equal(
+        tf.expand_dims(session_ids, -1), tf.expand_dims(session_ids, 0))
+    pairwise_mask = tf.logical_and(pairwise_mask, group_equal)
+
+  pairwise_logits = tf.boolean_mask(pairwise_logits, pairwise_mask)
+  pairwise_pseudo_labels = tf.ones_like(pairwise_logits)
+
+  if tf.is_numeric_tensor(weights):
+    logging.info('[pairwise_loss] use sample weight')
+    weights = tf.expand_dims(tf.cast(weights, tf.float32), -1)
+    batch_size, _ = get_shape_list(weights, 2)
+    pairwise_weights = tf.tile(weights, tf.stack([1, batch_size]))
+    pairwise_weights = tf.boolean_mask(pairwise_weights, pairwise_mask)
+  else:
+    pairwise_weights = weights
+
+  loss = tf.losses.sigmoid_cross_entropy(
+      pairwise_pseudo_labels, pairwise_logits, weights=pairwise_weights)
+  # set rank loss to zero if a batch has no positive sample.
+  loss = tf.where(tf.is_nan(loss), tf.zeros_like(loss), loss)
+  return loss
 
+
+def pairwise_focal_loss(labels,
+                        logits,
+                        session_ids=None,
+                        margin=0,
+                        gamma=2,
+                        alpha=None,
+                        weights=1.0):
+  logging.info('[pairwise_focal_loss] margin: {}, gamma: {}, alpha: {}'.format(
+      margin, gamma, alpha))
+  pairwise_logits = tf.math.subtract(
+      tf.expand_dims(logits, -1), tf.expand_dims(logits, 0)) - margin
+  pairwise_mask = tf.greater(
+      tf.expand_dims(labels, -1) - tf.expand_dims(labels, 0), 0)
+  if session_ids is not None:
+    logging.info('[pairwise_focal_loss] use session ids')
+    group_equal = tf.equal(
+        tf.expand_dims(session_ids, -1), tf.expand_dims(session_ids, 0))
+    pairwise_mask = tf.logical_and(pairwise_mask, group_equal)
   pairwise_logits = tf.boolean_mask(pairwise_logits, pairwise_mask)
-  logging.info('[pairwise_loss] after masking: {}'.format(pairwise_logits))
+
+  if tf.is_numeric_tensor(weights):
+    logging.info('[pairwise_focal_loss] use sample weight')
+    weights = tf.expand_dims(tf.cast(weights, tf.float32), -1)
+    batch_size, _ = get_shape_list(weights, 2)
+    pairwise_weights = tf.tile(weights, tf.stack([1, batch_size]))
+    pairwise_weights = tf.boolean_mask(pairwise_weights, pairwise_mask)
+  else:
+    pairwise_weights = weights
 
   pairwise_pseudo_labels = tf.ones_like(pairwise_logits)
-  loss = tf.losses.sigmoid_cross_entropy(pairwise_pseudo_labels,
-                                         pairwise_logits)
+  loss = sigmoid_focal_loss_with_logits(
+      pairwise_pseudo_labels,
+      pairwise_logits,
+      gamma=gamma,
+      alpha=alpha,
+      sample_weights=pairwise_weights)
+
+  # set rank loss to zero if a batch has no positive sample.
+  loss = tf.where(tf.is_nan(loss), tf.zeros_like(loss), loss)
+  return loss
+
+
+def pairwise_logistic_loss(labels,
+                           logits,
+                           session_ids=None,
+                           temperature=1.0,
+                           weights=1.0):
+  r"""Pairwise logistic loss.
+
+  Definition:
+  $$
+  \mathcal{L}(\{y\}, \{s\}) =
+  \sum_i \sum_j I[y_i > y_j] \log(1 + \exp(-(s_i - s_j)))
+  $$
+
+  Args:
+    labels: A `Tensor` of the same shape as `logits` representing graded
+      relevance.
+    logits: A `Tensor` with shape [batch_size].
+    session_ids: a `Tensor` with shape [batch_size]. Session ids of each sample, used to max GAUC metric. e.g. user_id
+    temperature: A float number to modify the scores=scores/temperature.
+    weights: A scalar, a `Tensor` with shape [batch_size] for each sample
+  """
+  logits /= temperature
+  pairwise_logits = tf.math.subtract(
+      tf.expand_dims(logits, -1), tf.expand_dims(logits, 0))
+
+  pairwise_mask = tf.greater(
+      tf.expand_dims(labels, -1) - tf.expand_dims(labels, 0), 0)
+  if session_ids is not None:
+    logging.info('[pairwise_logistic_loss] use session ids')
+    group_equal = tf.equal(
+        tf.expand_dims(session_ids, -1), tf.expand_dims(session_ids, 0))
+    pairwise_mask = tf.logical_and(pairwise_mask, group_equal)
+  pairwise_logits = tf.boolean_mask(pairwise_logits, pairwise_mask)
+
+  # The following is the same as log(1 + exp(-pairwise_logits)).
+  losses = tf.nn.relu(-pairwise_logits) + tf.math.log1p(
+      tf.exp(-tf.abs(pairwise_logits)))
+
+  if tf.is_numeric_tensor(weights):
+    logging.info('[pairwise_logistic_loss] use sample weight')
+    weights = tf.expand_dims(tf.cast(weights, tf.float32), -1)
+    batch_size, _ = get_shape_list(weights, 2)
+    pairwise_weights = tf.tile(weights, tf.stack([1, batch_size]))
+    pairwise_weights = tf.boolean_mask(pairwise_weights, pairwise_mask)
+  else:
+    pairwise_weights = weights
+
+  loss = compute_weighted_loss(losses, pairwise_weights)
   # set rank loss to zero if a batch has no positive sample.
   loss = tf.where(tf.is_nan(loss), tf.zeros_like(loss), loss)
   return loss
diff --git a/easy_rec/python/model/multi_task_model.py b/easy_rec/python/model/multi_task_model.py
index 677d5bc58..de321ab7d 100644
--- a/easy_rec/python/model/multi_task_model.py
+++ b/easy_rec/python/model/multi_task_model.py
@@ -89,7 +89,9 @@ def build_loss_graph(self):
     """Build loss graph for multi task model."""
     for task_tower_cfg in self._task_towers:
       tower_name = task_tower_cfg.tower_name
-      loss_weight = task_tower_cfg.weight * self._sample_weight
+      loss_weight = task_tower_cfg.weight
+      if task_tower_cfg.use_sample_weight:
+        loss_weight *= self._sample_weight
 
       if hasattr(task_tower_cfg, 'task_space_indicator_label') and \
           task_tower_cfg.HasField('task_space_indicator_label'):
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index c5cb118e6..65e1364a6 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -35,10 +35,16 @@ def _output_to_prediction_impl(self,
                                  num_class=1,
                                  suffix=''):
     prediction_dict = {}
-    if loss_type == LossType.F1_REWEIGHTED_LOSS or loss_type == LossType.PAIR_WISE_LOSS:
+    binary_loss_type = {
+        LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
+        LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
+        LossType.PAIRWISE_LOGISTIC_LOSS
+    }
+    if loss_type in binary_loss_type:
       assert num_class == 1, 'num_class must be 1 when loss type is F1_REWEIGHTED_LOSS/PAIR_WISE_LOSS'
       output = tf.squeeze(output, axis=1)
       probs = tf.sigmoid(output)
+      tf.summary.scalar('prediction/probs', tf.reduce_mean(probs))
       prediction_dict['logits' + suffix] = output
       prediction_dict['probs' + suffix] = probs
     elif loss_type == LossType.CLASSIFICATION:
@@ -96,7 +102,8 @@ def build_rtp_output_dict(self):
         loss_types = {loss.loss_type for loss in self._losses}
       binary_loss_set = {
           LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
-          LossType.PAIR_WISE_LOSS
+          LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
+          LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS
       }
       if loss_types & binary_loss_set:
         if 'probs' in self._prediction_dict:
@@ -117,7 +124,7 @@ def build_rtp_output_dict(self):
               + 't_graph() is called.')
       else:
         logging.warning(
-            'failed to build RTP rank_predict: unsupported loss type {}'.foramt(
+            'failed to build RTP rank_predict: unsupported loss type {}'.format(
                 loss_types))
       if forwarded is not None:
         rank_predict = tf.identity(forwarded, name='rank_predict')
@@ -133,14 +140,16 @@ def _build_loss_impl(self,
                        suffix='',
                        loss_param=None):
     loss_dict = {}
+    binary_loss_type = {
+        LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
+        LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
+        LossType.PAIRWISE_LOGISTIC_LOSS
+    }
     if loss_type == LossType.CLASSIFICATION:
       loss_name = 'cross_entropy_loss' + suffix
       pred = self._prediction_dict['logits' + suffix]
-    elif loss_type == LossType.F1_REWEIGHTED_LOSS:
-      loss_name = 'f1_reweighted_loss' + suffix
-      pred = self._prediction_dict['logits' + suffix]
-    elif loss_type == LossType.PAIR_WISE_LOSS:
-      loss_name = 'pairwise_loss' + suffix
+    elif loss_type in binary_loss_type:
+      loss_name = LossType.Name(loss_type).lower() + suffix
       pred = self._prediction_dict['logits' + suffix]
     elif loss_type in [LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS]:
       loss_name = 'l2_loss' + suffix
@@ -150,13 +159,18 @@ def _build_loss_impl(self,
 
     tf.summary.scalar('labels/%s' % label_name,
                       tf.reduce_mean(tf.to_float(self._labels[label_name])))
+    kwargs = {}
+    if loss_param is not None:
+      if hasattr(loss_param, 'session_name'):
+        kwargs['session_ids'] = self._labels[loss_param.session_name]
     loss_dict[loss_name] = loss_builder.build(
         loss_type,
         self._labels[label_name],
         pred,
         loss_weight,
         num_class,
-        loss_param=loss_param)
+        loss_param=loss_param,
+        **kwargs)
     return loss_dict
 
   def build_loss_graph(self):
@@ -202,7 +216,8 @@ def _build_metric_impl(self,
     from easy_rec.python.core import metrics as metrics_lib
     binary_loss_set = {
         LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
-        LossType.PAIR_WISE_LOSS
+        LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
+        LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS
     }
     metric_dict = {}
     if metric.WhichOneof('metric') == 'auc':
@@ -342,7 +357,8 @@ def build_metric_graph(self, eval_config):
   def _get_outputs_impl(self, loss_type, num_class=1, suffix=''):
     binary_loss_set = {
         LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
-        LossType.PAIR_WISE_LOSS
+        LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
+        LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS
     }
     if loss_type in binary_loss_set:
       if num_class == 1:
diff --git a/easy_rec/python/protos/loss.proto b/easy_rec/python/protos/loss.proto
index c0284711a..4b0f2fd5b 100644
--- a/easy_rec/python/protos/loss.proto
+++ b/easy_rec/python/protos/loss.proto
@@ -13,6 +13,9 @@ enum LossType {
     SOFTMAX_CROSS_ENTROPY_WITH_NEGATIVE_MINING = 7;
     PAIR_WISE_LOSS = 8;
     F1_REWEIGHTED_LOSS = 9;
+    BINARY_FOCAL_LOSS = 10;
+    PAIRWISE_FOCAL_LOSS = 11;
+    PAIRWISE_LOGISTIC_LOSS = 12;
 }
 
 message Loss {
@@ -23,6 +26,10 @@ message Loss {
     SoftmaxCrossEntropyWithNegativeMining softmax_loss = 102;
     CircleLoss circle_loss = 103;
     MultiSimilarityLoss multi_simi_loss = 104;
+    BinaryFocalLoss binary_focal_loss = 105;
+    PairwiseLoss pairwise_loss = 106;
+    PairwiseFocalLoss pairwise_focal_loss = 107;
+    PairwiseLogisticLoss pairwise_logistic_loss = 108;
   }
 };
 
@@ -49,3 +56,25 @@ message F1ReweighedLoss {
   required float f1_beta_square = 1 [default = 1.0];
   required float label_smoothing = 2 [default = 0];
 }
+
+message BinaryFocalLoss {
+  required float gamma = 1 [default = 2.0];
+  optional float alpha = 2;
+}
+
+message PairwiseLoss {
+  required float margin = 1 [default = 0];
+  optional string session_name = 2;
+}
+
+message PairwiseFocalLoss {
+  required float gamma = 1 [default = 2.0];
+  optional float alpha = 2;
+  required float margin = 3 [default = 0];
+  optional string session_name = 4;
+}
+
+message PairwiseLogisticLoss {
+  required float temperature = 1 [default = 1.0];
+  optional string session_name = 4;
+}
diff --git a/easy_rec/python/protos/tower.proto b/easy_rec/python/protos/tower.proto
index 02c6ce67c..580708825 100644
--- a/easy_rec/python/protos/tower.proto
+++ b/easy_rec/python/protos/tower.proto
@@ -26,7 +26,7 @@ message TaskTower {
     optional DNN dnn = 6;
     // training loss weights
     optional float weight = 7 [default = 1.0];
-    // label name for indcating the sample space for the task tower
+    // label name for indicating the sample space for the task tower
     optional string task_space_indicator_label = 10;
     // the loss weight for sample in the task space
     optional float in_task_space_weight = 11 [default = 1.0];
@@ -34,6 +34,8 @@ message TaskTower {
     optional float out_task_space_weight = 12 [default = 1.0];
     // multiple losses
     repeated Loss losses = 13;
+    // whether to use sample weight in this tower
+    required bool use_sample_weight = 14 [default = true];
 };
 
 
@@ -68,4 +70,6 @@ message BayesTaskTower {
     // optional float prediction_weight = 14 [default = 1.0];
     // multiple losses
     repeated Loss losses = 15;
+    // whether to use sample weight in this tower
+    required bool use_sample_weight = 16 [default = true];
 };
diff --git a/easy_rec/python/utils/activation.py b/easy_rec/python/utils/activation.py
index 25df2a486..d05d705b3 100644
--- a/easy_rec/python/utils/activation.py
+++ b/easy_rec/python/utils/activation.py
@@ -13,10 +13,10 @@
 except ImportError:
   BatchNormalization = tf.keras.layers.BatchNormalization
 
-try:
-  from tensorflow.python.ops.init_ops import Zeros
-except ImportError:
-  from tensorflow.python.ops.init_ops_v2 import Zeros
+# try:
+#   from tensorflow.python.ops.init_ops import Zeros
+# except ImportError:
+#   from tensorflow.python.ops.init_ops_v2 import Zeros
 
 
 class Dice(Layer):
@@ -42,22 +42,30 @@ class Dice(Layer):
      ACM, 2018: 1059-1068.] (https://arxiv.org/pdf/1706.06978.pdf)
   """
 
-  def __init__(self, axis=-1, epsilon=1e-9, is_training=None, **kwargs):
+  def __init__(self,
+               feat_dim,
+               axis=-1,
+               epsilon=1e-9,
+               is_training=None,
+               **kwargs):
     super(Dice, self).__init__(**kwargs)
     self.axis = axis
     self.epsilon = epsilon
     self.is_training = is_training
-
-  def build(self, input_shape):
-    super(Dice, self).build(input_shape)  # Be sure to call this somewhere!
     self.bn = BatchNormalization(
         axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
-    self.alphas = self.add_weight(
-        shape=(input_shape[-1],),
-        initializer=Zeros(),
-        dtype=tf.float32,
-        name='dice_alpha')  # name='alpha_'+self.name
-    self.uses_learning_phase = True
+    self.alphas = tf.Variable(tf.zeros([feat_dim]), dtype=tf.float32)
+
+  # def build(self, input_shape):
+  #   super(Dice, self).build(input_shape)  # Be sure to call this somewhere!
+  #   self.bn = BatchNormalization(
+  #       axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
+  #   self.alphas = self.add_weight(
+  #       shape=(input_shape[-1],),
+  #       initializer=Zeros(),
+  #       dtype=tf.float32,
+  #       name='dice_alpha')  # name='alpha_'+self.name
+  #   self.uses_learning_phase = True
 
   def call(self, inputs, **kwargs):
     inputs_normed = self.bn(inputs, training=self.is_training)
diff --git a/setup.cfg b/setup.cfg
index b180b9fb1..469407312 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ multi_line_output = 7
 force_single_line = true
 known_standard_library = setuptools
 known_first_party = easy_rec
-known_third_party = absl,common_io,docutils,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
+known_third_party = absl,common_io,docutils,focal_loss,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
 no_lines_before = LOCALFOLDER
 default_section = THIRDPARTY
 skip = easy_rec/python/protos

From 6b54fe70f904292d57bb3aa6daf917b984a1f990 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 21 Mar 2023 00:47:32 +0800
Subject: [PATCH 06/54] [feat]: add pairwise logistic loss

---
 easy_rec/python/builders/loss_builder.py      | 40 ++++++--
 .../python/compat/weight_decay_optimizers.py  |  8 +-
 easy_rec/python/loss/focal_loss.py            | 35 ++++++-
 easy_rec/python/loss/pairwise_loss.py         | 95 +++++++++++++------
 easy_rec/python/model/rank_model.py           |  4 +-
 easy_rec/python/protos/loss.proto             |  9 +-
 6 files changed, 143 insertions(+), 48 deletions(-)

diff --git a/easy_rec/python/builders/loss_builder.py b/easy_rec/python/builders/loss_builder.py
index cf2751965..390b7996c 100644
--- a/easy_rec/python/builders/loss_builder.py
+++ b/easy_rec/python/builders/loss_builder.py
@@ -23,6 +23,7 @@ def build(loss_type,
           num_class=1,
           loss_param=None,
           **kwargs):
+  loss_name = kwargs.pop('loss_name')
   if loss_type == LossType.CLASSIFICATION:
     if num_class == 1:
       return tf.losses.sigmoid_cross_entropy(
@@ -42,25 +43,46 @@ def build(loss_type,
     session = kwargs.get('session_ids', None)
     margin = 0 if loss_param is None else loss_param.margin
     return pairwise_loss(
-        label, pred, session_ids=session, margin=margin, weights=loss_weight)
+        label,
+        pred,
+        session_ids=session,
+        margin=margin,
+        weights=loss_weight,
+        name=loss_name)
   elif loss_type == LossType.PAIRWISE_LOGISTIC_LOSS:
     session = kwargs.get('session_ids', None)
     temp = 1.0 if loss_param is None else loss_param.temperature
+    ohem_ratio = 1.0 if loss_param is None else loss_param.ohem_ratio
+    hinge_margin = None
+    if loss_param is not None and loss_param.HasField('hinge_margin'):
+      hinge_margin = loss_param.hinge_margin
     return pairwise_logistic_loss(
-        label, pred, session_ids=session, temperature=temp, weights=loss_weight)
+        label,
+        pred,
+        session_ids=session,
+        temperature=temp,
+        hinge_margin=hinge_margin,
+        ohem_ratio=ohem_ratio,
+        weights=loss_weight,
+        name=loss_name)
   elif loss_type == LossType.PAIRWISE_FOCAL_LOSS:
     session = kwargs.get('session_ids', None)
     if loss_param is None:
       return pairwise_focal_loss(
-          label, pred, session_ids=session, weights=loss_weight)
+          label, pred, session_ids=session, weights=loss_weight, name=loss_name)
+    hinge_margin = None
+    if loss_param.HasField('hinge_margin'):
+      hinge_margin = loss_param.hinge_margin
     return pairwise_focal_loss(
         label,
         pred,
         session_ids=session,
         gamma=loss_param.gamma,
         alpha=loss_param.alpha if loss_param.HasField('alpha') else None,
-        margin=loss_param.margin,
-        weights=loss_weight)
+        hinge_margin=hinge_margin,
+        ohem_ratio=loss_param.ohem_ratio,
+        weights=loss_weight,
+        name=loss_name)
   elif loss_type == LossType.F1_REWEIGHTED_LOSS:
     f1_beta_square = 1.0 if loss_param is None else loss_param.f1_beta_square
     label_smoothing = 0 if loss_param is None else loss_param.label_smoothing
@@ -79,7 +101,13 @@ def build(loss_type,
     if loss_param.HasField('alpha'):
       alpha = loss_param.alpha
     return sigmoid_focal_loss_with_logits(
-        label, pred, gamma=gamma, alpha=alpha, sample_weights=loss_weight)
+        label,
+        pred,
+        gamma=gamma,
+        alpha=alpha,
+        ohem_ratio=loss_param.ohem_ratio,
+        sample_weights=loss_weight,
+        label_smoothing=loss_param.label_smoothing)
   else:
     raise ValueError('unsupported loss type: %s' % LossType.Name(loss_type))
 
diff --git a/easy_rec/python/compat/weight_decay_optimizers.py b/easy_rec/python/compat/weight_decay_optimizers.py
index 26eb9754f..7c9baf905 100755
--- a/easy_rec/python/compat/weight_decay_optimizers.py
+++ b/easy_rec/python/compat/weight_decay_optimizers.py
@@ -411,12 +411,10 @@ def __init__(self,
 
 
 try:
-  # from tensorflow.python.training import AdamAsyncOptimizer
-  import tensorflow as tf
+  from tensorflow.train import AdamAsyncOptimizer
 
   @tf_export('contrib.opt.AdamAsyncWOptimizer')
-  class AdamAsyncWOptimizer(DecoupledWeightDecayExtension,
-                            tf.train.AdamAsyncOptimizer):
+  class AdamAsyncWOptimizer(DecoupledWeightDecayExtension, AdamAsyncOptimizer):
     """Optimizer that implements the Adam algorithm with weight decay.
 
     This is an implementation of the AdamW optimizer described in ["Fixing
@@ -474,5 +472,5 @@ def __init__(self,
           use_locking=use_locking,
           name=name)
 except ImportError:
-  print('import AdamAsyncOptimizer failed')
+  print('import AdamAsyncOptimizer failed when loading AdamAsyncWOptimizer')
   pass
diff --git a/easy_rec/python/loss/focal_loss.py b/easy_rec/python/loss/focal_loss.py
index d596b7938..2e322782e 100644
--- a/easy_rec/python/loss/focal_loss.py
+++ b/easy_rec/python/loss/focal_loss.py
@@ -12,7 +12,9 @@ def sigmoid_focal_loss_with_logits(labels,
                                    logits,
                                    gamma=2.0,
                                    alpha=None,
-                                   sample_weights=None):
+                                   ohem_ratio=1.0,
+                                   sample_weights=None,
+                                   label_smoothing=0):
   """Implements the focal loss function.
 
   Focal loss was first introduced in the RetinaNet paper
@@ -25,10 +27,15 @@ def sigmoid_focal_loss_with_logits(labels,
   imbalance between the background class and other classes is extremely high.
 
   Args
-      labels: true targets tensor.
-      logits: predictions tensor.
+      labels: `[batch_size]` target integer labels in `{0, 1}`.
+      logits: Float `[batch_size]` logits outputs of the network.
       alpha: balancing factor.
       gamma: modulating factor.
+      ohem_ratio: the percent of hard examples to be mined
+      sample_weights:  Optional `Tensor` whose rank is either 0, or the same rank as
+        `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+        be either `1`, or the same as the corresponding `losses` dimension).
+      label_smoothing: If greater than `0` then smooth the labels.
 
   Returns:
       Weighted loss float `Tensor`. If `reduction` is `NONE`,this has the
@@ -38,14 +45,19 @@ def sigmoid_focal_loss_with_logits(labels,
       ValueError: If the shape of `sample_weight` is invalid or value of
         `gamma` is less than zero
   """
+  assert 0 < ohem_ratio <= 1.0, 'ohem_ratio must be in (0, 1]'
   if gamma and gamma < 0:
     raise ValueError('Value of gamma should be greater than or equal to zero')
-  logging.info('[focal_loss] gamma: {}, alpha: {}'.format(gamma, alpha))
+  logging.info(
+      '[focal_loss] gamma: {}, alpha: {}, ohem_ratho: {}, label smoothing: {}'
+      .format(gamma, alpha, ohem_ratio, label_smoothing))
 
   y_true = tf.cast(labels, logits.dtype)
 
   # convert the predictions into probabilities
   y_pred = tf.nn.sigmoid(logits)
+  epsilon = 1e-7
+  y_pred = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)
   p_t = (y_true * y_pred) + ((1 - y_true) * (1 - y_pred))
   weights = tf.pow((1 - p_t), gamma)
 
@@ -59,4 +71,17 @@ def sigmoid_focal_loss_with_logits(labels,
     else:
       weights *= sample_weights
 
-  return tf.losses.sigmoid_cross_entropy(y_true, logits, weights=weights)
+  if ohem_ratio == 1.0:
+    return tf.losses.sigmoid_cross_entropy(
+        y_true, logits, weights=weights, label_smoothing=label_smoothing)
+
+  losses = tf.losses.sigmoid_cross_entropy(
+      y_true,
+      logits,
+      weights=weights,
+      label_smoothing=label_smoothing,
+      reduction=tf.losses.Reduction.NONE)
+  k = tf.size(losses) * ohem_ratio
+  topk = tf.nn.top_k(losses, k)
+  losses = tf.boolean_mask(topk.values, topk.values > 0)
+  return tf.reduce_mean(losses)
diff --git a/easy_rec/python/loss/pairwise_loss.py b/easy_rec/python/loss/pairwise_loss.py
index d2af10cb8..1a9382ab7 100644
--- a/easy_rec/python/loss/pairwise_loss.py
+++ b/easy_rec/python/loss/pairwise_loss.py
@@ -12,8 +12,13 @@
   tf = tf.compat.v1
 
 
-def pairwise_loss(labels, logits, session_ids=None, margin=0, weights=1.0):
-  """Pairwise loss.  Also see `pairwise_logistic_loss` below.
+def pairwise_loss(labels,
+                  logits,
+                  session_ids=None,
+                  margin=0,
+                  weights=1.0,
+                  name=''):
+  """Deprecated Pairwise loss.  Also see `pairwise_logistic_loss` below.
 
   Args:
     labels: a `Tensor` with shape [batch_size]. e.g. click or not click in the session.
@@ -21,23 +26,26 @@ def pairwise_loss(labels, logits, session_ids=None, margin=0, weights=1.0):
     session_ids: a `Tensor` with shape [batch_size]. Session ids of each sample, used to max GAUC metric. e.g. user_id
     margin: the margin between positive and negative sample pair
     weights: sample weights
+    name: the name of loss
   """
-  logging.info('[pairwise_loss] margin: {}'.format(margin))
+  loss_name = name if name else 'pairwise_logistic_loss'
+  logging.info('[{}] margin: {}'.format(loss_name, margin))
   pairwise_logits = tf.math.subtract(
       tf.expand_dims(logits, -1), tf.expand_dims(logits, 0)) - margin
   pairwise_mask = tf.greater(
       tf.expand_dims(labels, -1) - tf.expand_dims(labels, 0), 0)
   if session_ids is not None:
-    logging.info('[pairwise_loss] use session ids')
+    logging.info('[%s] use session ids' % loss_name)
     group_equal = tf.equal(
         tf.expand_dims(session_ids, -1), tf.expand_dims(session_ids, 0))
     pairwise_mask = tf.logical_and(pairwise_mask, group_equal)
 
   pairwise_logits = tf.boolean_mask(pairwise_logits, pairwise_mask)
-  pairwise_pseudo_labels = tf.ones_like(pairwise_logits)
+  num_pair = tf.size(pairwise_logits)
+  tf.summary.scalar('loss/%s_num_of_pairs' % loss_name, num_pair)
 
   if tf.is_numeric_tensor(weights):
-    logging.info('[pairwise_loss] use sample weight')
+    logging.info('[%s] use sample weight' % loss_name)
     weights = tf.expand_dims(tf.cast(weights, tf.float32), -1)
     batch_size, _ = get_shape_list(weights, 2)
     pairwise_weights = tf.tile(weights, tf.stack([1, batch_size]))
@@ -45,35 +53,48 @@ def pairwise_loss(labels, logits, session_ids=None, margin=0, weights=1.0):
   else:
     pairwise_weights = weights
 
+  pairwise_pseudo_labels = tf.ones_like(pairwise_logits)
   loss = tf.losses.sigmoid_cross_entropy(
       pairwise_pseudo_labels, pairwise_logits, weights=pairwise_weights)
   # set rank loss to zero if a batch has no positive sample.
-  loss = tf.where(tf.is_nan(loss), tf.zeros_like(loss), loss)
+  # loss = tf.where(tf.is_nan(loss), tf.zeros_like(loss), loss)
   return loss
 
 
 def pairwise_focal_loss(labels,
                         logits,
                         session_ids=None,
-                        margin=0,
+                        hinge_margin=None,
                         gamma=2,
                         alpha=None,
-                        weights=1.0):
-  logging.info('[pairwise_focal_loss] margin: {}, gamma: {}, alpha: {}'.format(
-      margin, gamma, alpha))
-  pairwise_logits = tf.math.subtract(
-      tf.expand_dims(logits, -1), tf.expand_dims(logits, 0)) - margin
+                        weights=1.0,
+                        ohem_ratio=1.0,
+                        name=''):
+  loss_name = name if name else 'pairwise_focal_loss'
+  logging.info(
+      '[{}] hinge margin: {}, gamma: {}, alpha: {}, ohem_ratio: {}'.format(
+          loss_name, hinge_margin, gamma, alpha, ohem_ratio))
+  assert 0 < ohem_ratio <= 1.0, 'ohem_ratio must be in (0, 1]'
+
+  pairwise_logits = tf.expand_dims(logits, -1) - tf.expand_dims(logits, 0)
+
   pairwise_mask = tf.greater(
       tf.expand_dims(labels, -1) - tf.expand_dims(labels, 0), 0)
+  if hinge_margin is not None:
+    hinge_mask = tf.less(pairwise_logits, hinge_margin)
+    pairwise_mask = tf.logical_and(pairwise_mask, hinge_mask)
   if session_ids is not None:
-    logging.info('[pairwise_focal_loss] use session ids')
+    logging.info('[%s] use session ids' % loss_name)
     group_equal = tf.equal(
         tf.expand_dims(session_ids, -1), tf.expand_dims(session_ids, 0))
     pairwise_mask = tf.logical_and(pairwise_mask, group_equal)
+
   pairwise_logits = tf.boolean_mask(pairwise_logits, pairwise_mask)
+  num_pair = tf.size(pairwise_logits)
+  tf.summary.scalar('loss/%s_num_of_pairs' % loss_name, num_pair)
 
   if tf.is_numeric_tensor(weights):
-    logging.info('[pairwise_focal_loss] use sample weight')
+    logging.info('[%s] use sample weight' % loss_name)
     weights = tf.expand_dims(tf.cast(weights, tf.float32), -1)
     batch_size, _ = get_shape_list(weights, 2)
     pairwise_weights = tf.tile(weights, tf.stack([1, batch_size]))
@@ -87,10 +108,8 @@ def pairwise_focal_loss(labels,
       pairwise_logits,
       gamma=gamma,
       alpha=alpha,
+      ohem_ratio=ohem_ratio,
       sample_weights=pairwise_weights)
-
-  # set rank loss to zero if a batch has no positive sample.
-  loss = tf.where(tf.is_nan(loss), tf.zeros_like(loss), loss)
   return loss
 
 
@@ -98,8 +117,11 @@ def pairwise_logistic_loss(labels,
                            logits,
                            session_ids=None,
                            temperature=1.0,
-                           weights=1.0):
-  r"""Pairwise logistic loss.
+                           hinge_margin=None,
+                           weights=1.0,
+                           ohem_ratio=1.0,
+                           name=''):
+  r"""Computes pairwise logistic loss between `labels` and `logits`.
 
   Definition:
   $$
@@ -112,28 +134,40 @@ def pairwise_logistic_loss(labels,
       relevance.
     logits: A `Tensor` with shape [batch_size].
     session_ids: a `Tensor` with shape [batch_size]. Session ids of each sample, used to max GAUC metric. e.g. user_id
-    temperature: A float number to modify the scores=scores/temperature.
+    temperature: (Optional) The temperature to use for scaling the logits.
+    hinge_margin: the margin between positive and negative logits
     weights: A scalar, a `Tensor` with shape [batch_size] for each sample
+    ohem_ratio: the percent of hard examples to be mined
+    name: the name of loss
   """
-  logits /= temperature
+  assert 0 < ohem_ratio <= 1.0, 'ohem_ratio must be in (0, 1]'
+  loss_name = name if name else 'pairwise_logistic_loss'
+  if temperature != 1.0:
+    logits /= temperature
   pairwise_logits = tf.math.subtract(
       tf.expand_dims(logits, -1), tf.expand_dims(logits, 0))
 
   pairwise_mask = tf.greater(
       tf.expand_dims(labels, -1) - tf.expand_dims(labels, 0), 0)
+  if hinge_margin is not None:
+    hinge_mask = tf.less(pairwise_logits, hinge_margin)
+    pairwise_mask = tf.logical_and(pairwise_mask, hinge_mask)
   if session_ids is not None:
-    logging.info('[pairwise_logistic_loss] use session ids')
+    logging.info('[%s] use session ids' % loss_name)
     group_equal = tf.equal(
         tf.expand_dims(session_ids, -1), tf.expand_dims(session_ids, 0))
     pairwise_mask = tf.logical_and(pairwise_mask, group_equal)
+
   pairwise_logits = tf.boolean_mask(pairwise_logits, pairwise_mask)
+  num_pair = tf.size(pairwise_logits)
+  tf.summary.scalar('loss/%s_num_of_pairs' % loss_name, num_pair)
 
   # The following is the same as log(1 + exp(-pairwise_logits)).
   losses = tf.nn.relu(-pairwise_logits) + tf.math.log1p(
       tf.exp(-tf.abs(pairwise_logits)))
 
   if tf.is_numeric_tensor(weights):
-    logging.info('[pairwise_logistic_loss] use sample weight')
+    logging.info('[%s] use sample weight' % loss_name)
     weights = tf.expand_dims(tf.cast(weights, tf.float32), -1)
     batch_size, _ = get_shape_list(weights, 2)
     pairwise_weights = tf.tile(weights, tf.stack([1, batch_size]))
@@ -141,7 +175,12 @@ def pairwise_logistic_loss(labels,
   else:
     pairwise_weights = weights
 
-  loss = compute_weighted_loss(losses, pairwise_weights)
-  # set rank loss to zero if a batch has no positive sample.
-  loss = tf.where(tf.is_nan(loss), tf.zeros_like(loss), loss)
-  return loss
+  if ohem_ratio == 1.0:
+    return compute_weighted_loss(losses, pairwise_weights)
+
+  losses = compute_weighted_loss(
+      losses, pairwise_weights, reduction=tf.losses.Reduction.NONE)
+  k = tf.size(losses) * ohem_ratio
+  topk = tf.nn.top_k(losses, k)
+  losses = tf.boolean_mask(topk.values, topk.values > 0)
+  return tf.reduce_mean(losses)
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index 65e1364a6..a4bce730e 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -159,10 +159,10 @@ def _build_loss_impl(self,
 
     tf.summary.scalar('labels/%s' % label_name,
                       tf.reduce_mean(tf.to_float(self._labels[label_name])))
-    kwargs = {}
+    kwargs = {'loss_name': loss_name}
     if loss_param is not None:
       if hasattr(loss_param, 'session_name'):
-        kwargs['session_ids'] = self._labels[loss_param.session_name]
+        kwargs['session_ids'] = self._feature_dict[loss_param.session_name]
     loss_dict[loss_name] = loss_builder.build(
         loss_type,
         self._labels[label_name],
diff --git a/easy_rec/python/protos/loss.proto b/easy_rec/python/protos/loss.proto
index 4b0f2fd5b..7a6be0238 100644
--- a/easy_rec/python/protos/loss.proto
+++ b/easy_rec/python/protos/loss.proto
@@ -60,6 +60,8 @@ message F1ReweighedLoss {
 message BinaryFocalLoss {
   required float gamma = 1 [default = 2.0];
   optional float alpha = 2;
+  optional float ohem_ratio = 3 [default = 1.0];
+  optional float label_smoothing = 4 [default = 0];
 }
 
 message PairwiseLoss {
@@ -70,11 +72,14 @@ message PairwiseLoss {
 message PairwiseFocalLoss {
   required float gamma = 1 [default = 2.0];
   optional float alpha = 2;
-  required float margin = 3 [default = 0];
+  optional float hinge_margin = 3 [default = 1.0];
   optional string session_name = 4;
+  optional float ohem_ratio = 5 [default = 1.0];
 }
 
 message PairwiseLogisticLoss {
   required float temperature = 1 [default = 1.0];
-  optional string session_name = 4;
+  optional string session_name = 2;
+  optional float hinge_margin = 3 [default = 1.0];
+  optional float ohem_ratio = 4 [default = 1.0];
 }

From d65ece39ac191bfa4307d240e5106277653c643d Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 21 Mar 2023 10:36:12 +0800
Subject: [PATCH 07/54] [feat]: add pairwise logistic loss

---
 easy_rec/python/builders/loss_builder.py |  8 ++++++--
 easy_rec/python/loss/focal_loss.py       | 12 +++++++----
 easy_rec/python/loss/pairwise_loss.py    | 26 +++++++++++++++++-------
 easy_rec/python/protos/loss.proto        |  2 ++
 4 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/easy_rec/python/builders/loss_builder.py b/easy_rec/python/builders/loss_builder.py
index 390b7996c..203e3279d 100644
--- a/easy_rec/python/builders/loss_builder.py
+++ b/easy_rec/python/builders/loss_builder.py
@@ -42,11 +42,13 @@ def build(loss_type,
   elif loss_type == LossType.PAIR_WISE_LOSS:
     session = kwargs.get('session_ids', None)
     margin = 0 if loss_param is None else loss_param.margin
+    temp = 1.0 if loss_param is None else loss_param.temperature
     return pairwise_loss(
         label,
         pred,
         session_ids=session,
         margin=margin,
+        temperature=temp,
         weights=loss_weight,
         name=loss_name)
   elif loss_type == LossType.PAIRWISE_LOGISTIC_LOSS:
@@ -81,6 +83,7 @@ def build(loss_type,
         alpha=loss_param.alpha if loss_param.HasField('alpha') else None,
         hinge_margin=hinge_margin,
         ohem_ratio=loss_param.ohem_ratio,
+        temperature=loss_param.temperature,
         weights=loss_weight,
         name=loss_name)
   elif loss_type == LossType.F1_REWEIGHTED_LOSS:
@@ -95,7 +98,7 @@ def build(loss_type,
   elif loss_type == LossType.BINARY_FOCAL_LOSS:
     if loss_param is None:
       return sigmoid_focal_loss_with_logits(
-          label, pred, sample_weights=loss_weight)
+          label, pred, sample_weights=loss_weight, name=loss_name)
     gamma = loss_param.gamma
     alpha = None
     if loss_param.HasField('alpha'):
@@ -107,7 +110,8 @@ def build(loss_type,
         alpha=alpha,
         ohem_ratio=loss_param.ohem_ratio,
         sample_weights=loss_weight,
-        label_smoothing=loss_param.label_smoothing)
+        label_smoothing=loss_param.label_smoothing,
+        name=loss_name)
   else:
     raise ValueError('unsupported loss type: %s' % LossType.Name(loss_type))
 
diff --git a/easy_rec/python/loss/focal_loss.py b/easy_rec/python/loss/focal_loss.py
index 2e322782e..515cb506b 100644
--- a/easy_rec/python/loss/focal_loss.py
+++ b/easy_rec/python/loss/focal_loss.py
@@ -14,7 +14,8 @@ def sigmoid_focal_loss_with_logits(labels,
                                    alpha=None,
                                    ohem_ratio=1.0,
                                    sample_weights=None,
-                                   label_smoothing=0):
+                                   label_smoothing=0,
+                                   name=''):
   """Implements the focal loss function.
 
   Focal loss was first introduced in the RetinaNet paper
@@ -36,6 +37,7 @@ def sigmoid_focal_loss_with_logits(labels,
         `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
         be either `1`, or the same as the corresponding `losses` dimension).
       label_smoothing: If greater than `0` then smooth the labels.
+      name: the name of loss
 
   Returns:
       Weighted loss float `Tensor`. If `reduction` is `NONE`,this has the
@@ -45,12 +47,13 @@ def sigmoid_focal_loss_with_logits(labels,
       ValueError: If the shape of `sample_weight` is invalid or value of
         `gamma` is less than zero
   """
-  assert 0 < ohem_ratio <= 1.0, 'ohem_ratio must be in (0, 1]'
+  loss_name = name if name else 'focal_loss'
+  assert 0 < ohem_ratio <= 1.0, loss_name + ' ohem_ratio must be in (0, 1]'
   if gamma and gamma < 0:
     raise ValueError('Value of gamma should be greater than or equal to zero')
   logging.info(
-      '[focal_loss] gamma: {}, alpha: {}, ohem_ratho: {}, label smoothing: {}'
-      .format(gamma, alpha, ohem_ratio, label_smoothing))
+      '[{}] gamma: {}, alpha: {}, ohem_ratho: {}, label smoothing: {}'.format(
+          loss_name, gamma, alpha, ohem_ratio, label_smoothing))
 
   y_true = tf.cast(labels, logits.dtype)
 
@@ -66,6 +69,7 @@ def sigmoid_focal_loss_with_logits(labels,
     weights *= alpha_factor
 
   if sample_weights is not None:
+    logging.info('[%s] use sample weight' % loss_name)
     if tf.is_numeric_tensor(sample_weights):
       weights *= tf.cast(sample_weights, tf.float32)
     else:
diff --git a/easy_rec/python/loss/pairwise_loss.py b/easy_rec/python/loss/pairwise_loss.py
index 1a9382ab7..a54c6d0a7 100644
--- a/easy_rec/python/loss/pairwise_loss.py
+++ b/easy_rec/python/loss/pairwise_loss.py
@@ -16,6 +16,7 @@ def pairwise_loss(labels,
                   logits,
                   session_ids=None,
                   margin=0,
+                  temperature=1.0,
                   weights=1.0,
                   name=''):
   """Deprecated Pairwise loss.  Also see `pairwise_logistic_loss` below.
@@ -25,11 +26,16 @@ def pairwise_loss(labels,
     logits: a `Tensor` with shape [batch_size]. e.g. the value of last neuron before activation.
     session_ids: a `Tensor` with shape [batch_size]. Session ids of each sample, used to max GAUC metric. e.g. user_id
     margin: the margin between positive and negative sample pair
+    temperature: (Optional) The temperature to use for scaling the logits.
     weights: sample weights
     name: the name of loss
   """
-  loss_name = name if name else 'pairwise_logistic_loss'
-  logging.info('[{}] margin: {}'.format(loss_name, margin))
+  loss_name = name if name else 'pairwise_loss'
+  logging.info('[{}] margin: {}, temperature: {}'.format(
+      loss_name, margin, temperature))
+
+  if temperature != 1.0:
+    logits /= temperature
   pairwise_logits = tf.math.subtract(
       tf.expand_dims(logits, -1), tf.expand_dims(logits, 0)) - margin
   pairwise_mask = tf.greater(
@@ -67,15 +73,18 @@ def pairwise_focal_loss(labels,
                         hinge_margin=None,
                         gamma=2,
                         alpha=None,
-                        weights=1.0,
                         ohem_ratio=1.0,
+                        temperature=1.0,
+                        weights=1.0,
                         name=''):
   loss_name = name if name else 'pairwise_focal_loss'
+  assert 0 < ohem_ratio <= 1.0, loss_name + ' ohem_ratio must be in (0, 1]'
   logging.info(
-      '[{}] hinge margin: {}, gamma: {}, alpha: {}, ohem_ratio: {}'.format(
-          loss_name, hinge_margin, gamma, alpha, ohem_ratio))
-  assert 0 < ohem_ratio <= 1.0, 'ohem_ratio must be in (0, 1]'
+      '[{}] hinge margin: {}, gamma: {}, alpha: {}, ohem_ratio: {}, temperature: {}'
+      .format(loss_name, hinge_margin, gamma, alpha, ohem_ratio, temperature))
 
+  if temperature != 1.0:
+    logits /= temperature
   pairwise_logits = tf.expand_dims(logits, -1) - tf.expand_dims(logits, 0)
 
   pairwise_mask = tf.greater(
@@ -140,8 +149,11 @@ def pairwise_logistic_loss(labels,
     ohem_ratio: the percent of hard examples to be mined
     name: the name of loss
   """
-  assert 0 < ohem_ratio <= 1.0, 'ohem_ratio must be in (0, 1]'
   loss_name = name if name else 'pairwise_logistic_loss'
+  assert 0 < ohem_ratio <= 1.0, loss_name + ' ohem_ratio must be in (0, 1]'
+  logging.info('[{}] hinge margin: {}, ohem_ratio: {}, temperature: {}'.format(
+      loss_name, hinge_margin, ohem_ratio, temperature))
+
   if temperature != 1.0:
     logits /= temperature
   pairwise_logits = tf.math.subtract(
diff --git a/easy_rec/python/protos/loss.proto b/easy_rec/python/protos/loss.proto
index 7a6be0238..156eec5ae 100644
--- a/easy_rec/python/protos/loss.proto
+++ b/easy_rec/python/protos/loss.proto
@@ -67,6 +67,7 @@ message BinaryFocalLoss {
 message PairwiseLoss {
   required float margin = 1 [default = 0];
   optional string session_name = 2;
+  optional float temperature = 3 [default = 1.0];
 }
 
 message PairwiseFocalLoss {
@@ -75,6 +76,7 @@ message PairwiseFocalLoss {
   optional float hinge_margin = 3 [default = 1.0];
   optional string session_name = 4;
   optional float ohem_ratio = 5 [default = 1.0];
+  optional float temperature = 6 [default = 1.0];
 }
 
 message PairwiseLogisticLoss {

From d2793df09caa125c875f9dc5ace4569e7f34bb2d Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 21 Mar 2023 10:45:47 +0800
Subject: [PATCH 08/54] [feat]: add pairwise logistic loss

---
 easy_rec/python/loss/pairwise_loss.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/easy_rec/python/loss/pairwise_loss.py b/easy_rec/python/loss/pairwise_loss.py
index a54c6d0a7..a1eda5873 100644
--- a/easy_rec/python/loss/pairwise_loss.py
+++ b/easy_rec/python/loss/pairwise_loss.py
@@ -3,7 +3,7 @@
 import logging
 
 import tensorflow as tf
-from focal_loss import sigmoid_focal_loss_with_logits
+from easy_rec.python.loss.focal_loss import sigmoid_focal_loss_with_logits
 from tensorflow.python.ops.losses.losses_impl import compute_weighted_loss
 
 from easy_rec.python.utils.shape_utils import get_shape_list

From 31e25027e77df9904005d8da50cbcea16e270fee Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 21 Mar 2023 11:44:38 +0800
Subject: [PATCH 09/54] [feat]: add pairwise logistic loss

---
 easy_rec/python/loss/focal_loss.py    | 3 ++-
 easy_rec/python/loss/pairwise_loss.py | 8 ++++++--
 setup.cfg                             | 2 +-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/easy_rec/python/loss/focal_loss.py b/easy_rec/python/loss/focal_loss.py
index 515cb506b..4d3c13140 100644
--- a/easy_rec/python/loss/focal_loss.py
+++ b/easy_rec/python/loss/focal_loss.py
@@ -85,7 +85,8 @@ def sigmoid_focal_loss_with_logits(labels,
       weights=weights,
       label_smoothing=label_smoothing,
       reduction=tf.losses.Reduction.NONE)
-  k = tf.size(losses) * ohem_ratio
+  k = tf.size(losses, out_type=tf.float32) * tf.convert_to_tensor(ohem_ratio)
+  k = tf.to_int32(tf.math.rint(k))
   topk = tf.nn.top_k(losses, k)
   losses = tf.boolean_mask(topk.values, topk.values > 0)
   return tf.reduce_mean(losses)
diff --git a/easy_rec/python/loss/pairwise_loss.py b/easy_rec/python/loss/pairwise_loss.py
index a1eda5873..07d45896e 100644
--- a/easy_rec/python/loss/pairwise_loss.py
+++ b/easy_rec/python/loss/pairwise_loss.py
@@ -3,9 +3,9 @@
 import logging
 
 import tensorflow as tf
-from easy_rec.python.loss.focal_loss import sigmoid_focal_loss_with_logits
 from tensorflow.python.ops.losses.losses_impl import compute_weighted_loss
 
+from easy_rec.python.loss.focal_loss import sigmoid_focal_loss_with_logits
 from easy_rec.python.utils.shape_utils import get_shape_list
 
 if tf.__version__ >= '2.0':
@@ -30,6 +30,9 @@ def pairwise_loss(labels,
     weights: sample weights
     name: the name of loss
   """
+  logging.warning(
+      'The old `pairwise_loss` is being deprecated. '
+      'Please use the new `pairwise_logistic_loss` or `pairwise_focal_loss`')
   loss_name = name if name else 'pairwise_loss'
   logging.info('[{}] margin: {}, temperature: {}'.format(
       loss_name, margin, temperature))
@@ -192,7 +195,8 @@ def pairwise_logistic_loss(labels,
 
   losses = compute_weighted_loss(
       losses, pairwise_weights, reduction=tf.losses.Reduction.NONE)
-  k = tf.size(losses) * ohem_ratio
+  k = tf.size(losses, out_type=tf.float32) * tf.convert_to_tensor(ohem_ratio)
+  k = tf.to_int32(tf.math.rint(k))
   topk = tf.nn.top_k(losses, k)
   losses = tf.boolean_mask(topk.values, topk.values > 0)
   return tf.reduce_mean(losses)
diff --git a/setup.cfg b/setup.cfg
index 469407312..b180b9fb1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ multi_line_output = 7
 force_single_line = true
 known_standard_library = setuptools
 known_first_party = easy_rec
-known_third_party = absl,common_io,docutils,focal_loss,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
+known_third_party = absl,common_io,docutils,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
 no_lines_before = LOCALFOLDER
 default_section = THIRDPARTY
 skip = easy_rec/python/protos

From 7eb9c5c66da785b5e3e095001fa002d5a382683b Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 21 Mar 2023 18:42:11 +0800
Subject: [PATCH 10/54] [feat]: add pairwise logistic loss

---
 easy_rec/python/loss/focal_loss.py        |  7 ++++---
 easy_rec/python/loss/pairwise_loss.py     |  2 +-
 easy_rec/python/model/multi_task_model.py |  1 +
 easy_rec/python/model/rank_model.py       | 11 ++++++++---
 easy_rec/python/protos/loss.proto         |  1 +
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/easy_rec/python/loss/focal_loss.py b/easy_rec/python/loss/focal_loss.py
index 4d3c13140..9ef6a94a7 100644
--- a/easy_rec/python/loss/focal_loss.py
+++ b/easy_rec/python/loss/focal_loss.py
@@ -69,10 +69,11 @@ def sigmoid_focal_loss_with_logits(labels,
     weights *= alpha_factor
 
   if sample_weights is not None:
-    logging.info('[%s] use sample weight' % loss_name)
     if tf.is_numeric_tensor(sample_weights):
+      logging.info('[%s] use sample weight' % loss_name)
       weights *= tf.cast(sample_weights, tf.float32)
-    else:
+    elif sample_weights != 1.0:
+      logging.info('[%s] use sample weight: %f' % (loss_name, sample_weights))
       weights *= sample_weights
 
   if ohem_ratio == 1.0:
@@ -85,7 +86,7 @@ def sigmoid_focal_loss_with_logits(labels,
       weights=weights,
       label_smoothing=label_smoothing,
       reduction=tf.losses.Reduction.NONE)
-  k = tf.size(losses, out_type=tf.float32) * tf.convert_to_tensor(ohem_ratio)
+  k = tf.to_float(tf.size(losses)) * tf.convert_to_tensor(ohem_ratio)
   k = tf.to_int32(tf.math.rint(k))
   topk = tf.nn.top_k(losses, k)
   losses = tf.boolean_mask(topk.values, topk.values > 0)
diff --git a/easy_rec/python/loss/pairwise_loss.py b/easy_rec/python/loss/pairwise_loss.py
index 07d45896e..a421cdbba 100644
--- a/easy_rec/python/loss/pairwise_loss.py
+++ b/easy_rec/python/loss/pairwise_loss.py
@@ -195,7 +195,7 @@ def pairwise_logistic_loss(labels,
 
   losses = compute_weighted_loss(
       losses, pairwise_weights, reduction=tf.losses.Reduction.NONE)
-  k = tf.size(losses, out_type=tf.float32) * tf.convert_to_tensor(ohem_ratio)
+  k = tf.to_float(tf.size(losses)) * tf.convert_to_tensor(ohem_ratio)
   k = tf.to_int32(tf.math.rint(k))
   topk = tf.nn.top_k(losses, k)
   losses = tf.boolean_mask(topk.values, topk.values > 0)
diff --git a/easy_rec/python/model/multi_task_model.py b/easy_rec/python/model/multi_task_model.py
index de321ab7d..4ffd404d9 100644
--- a/easy_rec/python/model/multi_task_model.py
+++ b/easy_rec/python/model/multi_task_model.py
@@ -121,6 +121,7 @@ def build_loss_graph(self):
               loss_weight=loss_weight,
               num_class=task_tower_cfg.num_class,
               suffix='_%s' % tower_name,
+              loss_name=loss.loss_name,
               loss_param=loss_param)
           for loss_name, loss_value in loss_ops.items():
             loss_dict[loss_name] = loss_value * loss.weight
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index a4bce730e..b0463f10d 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -138,6 +138,7 @@ def _build_loss_impl(self,
                        loss_weight=1.0,
                        num_class=1,
                        suffix='',
+                       loss_name='',
                        loss_param=None):
     loss_dict = {}
     binary_loss_type = {
@@ -146,13 +147,16 @@ def _build_loss_impl(self,
         LossType.PAIRWISE_LOGISTIC_LOSS
     }
     if loss_type == LossType.CLASSIFICATION:
-      loss_name = 'cross_entropy_loss' + suffix
+      loss_name = loss_name if loss_name else 'cross_entropy_loss' + suffix
       pred = self._prediction_dict['logits' + suffix]
     elif loss_type in binary_loss_type:
-      loss_name = LossType.Name(loss_type).lower() + suffix
+      if not loss_name:
+        loss_name = LossType.Name(loss_type).lower() + suffix
+      else:
+        loss_name = loss_name + suffix
       pred = self._prediction_dict['logits' + suffix]
     elif loss_type in [LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS]:
-      loss_name = 'l2_loss' + suffix
+      loss_name = loss_name if loss_name else 'l2_loss' + suffix
       pred = self._prediction_dict['y' + suffix]
     else:
       raise ValueError('invalid loss type: %s' % LossType.Name(loss_type))
@@ -191,6 +195,7 @@ def build_loss_graph(self):
             label_name=self._label_name,
             loss_weight=self._sample_weight,
             num_class=self._num_class,
+            loss_name=loss.loss_name,
             loss_param=loss_param)
         for loss_name, loss_value in loss_ops.items():
           loss_dict[loss_name] = loss_value * loss.weight
diff --git a/easy_rec/python/protos/loss.proto b/easy_rec/python/protos/loss.proto
index 156eec5ae..7004972a0 100644
--- a/easy_rec/python/protos/loss.proto
+++ b/easy_rec/python/protos/loss.proto
@@ -21,6 +21,7 @@ enum LossType {
 message Loss {
   required LossType loss_type = 1;
   required float weight = 2 [default = 1.0];
+  optional string loss_name = 3;
   oneof loss_param {
     F1ReweighedLoss f1_reweighted_loss = 101;
     SoftmaxCrossEntropyWithNegativeMining softmax_loss = 102;

From 547c807db6e12579d9fb4f270edb6b1dd8a774c2 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 4 Apr 2023 19:31:41 +0800
Subject: [PATCH 11/54] [feat]: add jrc loss

---
 easy_rec/python/builders/loss_builder.py   |   6 +
 easy_rec/python/input/input.py             | 121 ++++++++++-----------
 easy_rec/python/layers/din.py              |  17 ++-
 easy_rec/python/loss/jrc_loss.py           |  62 +++++++++++
 easy_rec/python/main.py                    |   1 -
 easy_rec/python/model/multi_task_model.py  |  11 +-
 easy_rec/python/model/rank_model.py        |  23 +++-
 easy_rec/python/protos/loss.proto          |  10 +-
 easy_rec/python/tools/feature_selection.py |   2 +-
 easy_rec/python/utils/load_class.py        |   2 +
 pai_jobs/run.py                            |   2 +-
 11 files changed, 183 insertions(+), 74 deletions(-)
 create mode 100644 easy_rec/python/loss/jrc_loss.py

diff --git a/easy_rec/python/builders/loss_builder.py b/easy_rec/python/builders/loss_builder.py
index 203e3279d..5427c0d54 100644
--- a/easy_rec/python/builders/loss_builder.py
+++ b/easy_rec/python/builders/loss_builder.py
@@ -8,6 +8,7 @@
 from easy_rec.python.loss.pairwise_loss import pairwise_focal_loss
 from easy_rec.python.loss.pairwise_loss import pairwise_logistic_loss
 from easy_rec.python.loss.pairwise_loss import pairwise_loss
+from easy_rec.python.loss.jrc_loss import jrc_loss
 from easy_rec.python.protos.loss_pb2 import LossType
 
 from easy_rec.python.loss.f1_reweight_loss import f1_reweight_sigmoid_cross_entropy  # NOQA
@@ -39,6 +40,11 @@ def build(loss_type,
     logging.info('%s is used' % LossType.Name(loss_type))
     return tf.losses.mean_squared_error(
         labels=label, predictions=pred, weights=loss_weight, **kwargs)
+  elif loss_type == LossType.JRC_LOSS:
+    alpha = 0.5 if loss_param is None else loss_param.alpha
+    auto_weight = False if loss_param is None else not loss_param.HasField('alpha')
+    session = kwargs.get('session_ids', None)
+    return jrc_loss(label, pred, session, alpha, auto_weight=auto_weight, name=loss_name)
   elif loss_type == LossType.PAIR_WISE_LOSS:
     session = kwargs.get('session_ids', None)
     margin = 0 if loss_param is None else loss_param.margin
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index 739024486..09d4c299b 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -127,11 +127,11 @@ def __init__(self,
       metrics = self._pipeline_config.eval_config.metrics_set
       for metric in metrics:
         metric_name = metric.WhichOneof('metric')
-        if metric_name == 'GAUC':
+        if metric_name == 'gauc':
           uid = metric.gauc.uid_field
           if uid not in self._effective_fields:
             self._effective_fields.append(uid)
-        elif metric_name == 'SessionAUC':
+        elif metric_name == 'session_auc':
           sid = metric.session_auc.session_id_field
           if sid not in self._effective_fields:
             self._effective_fields.append(sid)
@@ -139,27 +139,17 @@ def __init__(self,
       # check multi task model's metrics
       model_config = self._pipeline_config.model_config
       model_name = model_config.WhichOneof('model')
-      model = None
-      if model_name == 'MMoE':
-        model = model_config.mmoe
-      elif model_name == 'ESMM':
-        model = model_config.esmm
-      elif model_name == 'DBMTL':
-        model = model_config.dbmtl
-      elif model_name == 'SimpleMultiTask':
-        model = model_config.simple_multi_task
-      elif model_name == 'PLE':
-        model = model_config.ple
-      if model is not None:
+      if model_name in {'mmoe', 'esmm', 'dbmtl', 'simple_multi_task', 'ple'}:
+        model = getattr(model_config, model_name)
         for tower in model.task_towers:
           metrics = tower.metrics_set
           for metric in metrics:
             metric_name = metric.WhichOneof('metric')
-            if metric_name == 'GAUC':
+            if metric_name == 'gauc':
               uid = metric.gauc.uid_field
               if uid not in self._effective_fields:
                 self._effective_fields.append(uid)
-            elif metric_name == 'SessionAUC':
+            elif metric_name == 'session_auc':
               sid = metric.session_auc.session_id_field
               if sid not in self._effective_fields:
                 self._effective_fields.append(sid)
@@ -482,52 +472,22 @@ def _parse_raw_feature(self, fc, parsed_dict, field_dict):
     input_0 = fc.input_names[0]
     feature_name = fc.feature_name if fc.HasField('feature_name') else input_0
     if field_dict[input_0].dtype == tf.string:
-
-      def combine(x):
-        seq = tf.string_split([x], fc.seq_multi_sep)
-        seq_len = tf.size(seq)
-        if fc.raw_input_dim > 1:
-          check_list = [
-              tf.py_func(
-                  check_split,
-                  [seq.values, fc.separator, fc.raw_input_dim, input_0],
-                  Tout=tf.bool)
-          ] if self._check_mode else []
-          with tf.control_dependencies(check_list):
-            emb = tf.string_split(seq.values, fc.separator).values
-        else:
-          emb = seq.values
-        check_list = [
-            tf.py_func(check_string_to_number, [emb, input_0], Tout=tf.bool)
-        ] if self._check_mode else []
-        with tf.control_dependencies(check_list):
-          emb_val = tf.string_to_number(emb)
-        emb_vec = tf.reshape(emb_val, [seq_len, -1])
-
-        if fc.combiner == 'max':
-          emb_vec = tf.reduce_max(emb_vec, axis=0)
-        elif fc.combiner == 'min':
-          emb_vec = tf.reduce_min(emb_vec, axis=0)
-        elif fc.combiner == 'sum':
-          emb_vec = tf.reduce_sum(emb_vec, axis=0)
-        elif fc.combiner == 'mean':
-          emb_vec = tf.reduce_mean(emb_vec, axis=0)
-        else:
-          assert False, 'unsupported combine operator: ' + fc.combiner
-        return emb_vec
-
       if fc.HasField('seq_multi_sep') and fc.HasField('combiner'):
-        parsed_dict[feature_name] = tf.map_fn(
-            combine, field_dict[input_0], dtype=tf.float32)
-      elif fc.raw_input_dim > 1:
+        fea = tf.string_split(field_dict[input_0], fc.seq_multi_sep)
+        segment_ids = fea.indices[:, 0]
+        vals = fea.values
+      else:
+        vals = field_dict[input_0]
+        segment_ids = tf.range(0, tf.shape(vals)[0])
+      if fc.raw_input_dim > 1:
         check_list = [
             tf.py_func(
                 check_split,
-                [field_dict[input_0], fc.separator, fc.raw_input_dim, input_0],
+                [vals, fc.separator, fc.raw_input_dim, input_0],
                 Tout=tf.bool)
         ] if self._check_mode else []
         with tf.control_dependencies(check_list):
-          tmp_fea = tf.string_split(field_dict[input_0], fc.separator)
+          tmp_fea = tf.string_split(vals, fc.separator)
         check_list = [
             tf.py_func(
                 check_string_to_number, [tmp_fea.values, input_0], Tout=tf.bool)
@@ -537,19 +497,53 @@ def combine(x):
               tmp_fea.values,
               tf.float32,
               name='multi_raw_fea_to_flt_%s' % input_0)
-        parsed_dict[feature_name] = tf.sparse_to_dense(
+        if fc.HasField('seq_multi_sep') and fc.HasField('combiner'):
+          emb = tf.reshape(tmp_vals, [-1, fc.raw_input_dim])
+          if fc.combiner == 'max':
+            emb = tf.segment_max(emb, segment_ids)
+          elif fc.combiner == 'sum':
+            emb = tf.segment_sum(emb, segment_ids)
+          elif fc.combiner == 'min':
+            emb = tf.segment_min(emb, segment_ids)
+          elif fc.combiner == 'mean':
+            emb = tf.segment_mean(emb, segment_ids)
+          else:
+            assert False, 'unsupported combine operator: ' + fc.combiner
+          parsed_dict[feature_name] = emb
+        else:
+          parsed_dict[feature_name] = tf.sparse_to_dense(
             tmp_fea.indices,
             [tf.shape(field_dict[input_0])[0], fc.raw_input_dim],
             tmp_vals,
             default_value=0)
-      else:
+      elif fc.HasField('seq_multi_sep') and fc.HasField('combiner'):
         check_list = [
-            tf.py_func(
-                check_string_to_number, [field_dict[input_0], input_0],
-                Tout=tf.bool)
+          tf.py_func(
+            check_string_to_number, [vals, input_0],
+            Tout=tf.bool)
         ] if self._check_mode else []
         with tf.control_dependencies(check_list):
-          parsed_dict[feature_name] = tf.string_to_number(
+          emb = tf.string_to_number(vals, tf.float32,
+                                    name='raw_fea_to_flt_%s' % input_0)
+        if fc.combiner == 'max':
+          emb = tf.segment_max(emb, segment_ids)
+        elif fc.combiner == 'sum':
+          emb = tf.segment_sum(emb, segment_ids)
+        elif fc.combiner == 'min':
+          emb = tf.segment_min(emb, segment_ids)
+        elif fc.combiner == 'mean':
+          emb = tf.segment_mean(emb, segment_ids)
+        else:
+          assert False, 'unsupported combine operator: ' + fc.combiner
+        parsed_dict[feature_name] = emb
+      else:
+         check_list = [
+           tf.py_func(
+             check_string_to_number, [field_dict[input_0], input_0],
+             Tout=tf.bool)
+         ] if self._check_mode else []
+         with tf.control_dependencies(check_list):
+           parsed_dict[feature_name] = tf.string_to_number(
               field_dict[input_0], tf.float32)
     elif field_dict[input_0].dtype in [
         tf.int32, tf.int64, tf.double, tf.float32
@@ -563,7 +557,7 @@ def combine(x):
           fc.max_val - fc.min_val)
 
     if fc.HasField('normalizer_fn'):
-      logging.info('apply normalizer_fn %s' % fc.normalizer_fn)
+      logging.info('apply normalizer_fn %s to `%s`' % (fc.normalizer_fn, feature_name))
       parsed_dict[feature_name] = self._normalizer_fn[feature_name](
           parsed_dict[feature_name])
 
@@ -845,6 +839,9 @@ def _preprocess(self, field_dict):
       if self._mode != tf.estimator.ModeKeys.PREDICT:
         parsed_dict[constant.SAMPLE_WEIGHT] = field_dict[
             self._data_config.sample_weight]
+
+    if Input.DATA_OFFSET in field_dict:
+      parsed_dict[Input.DATA_OFFSET] = field_dict[Input.DATA_OFFSET]
     return {'feature': parsed_dict, 'label': label_dict}
 
   def _lookup_preprocess(self, fc, field_dict):
diff --git a/easy_rec/python/layers/din.py b/easy_rec/python/layers/din.py
index 60d106fe3..717dd9789 100644
--- a/easy_rec/python/layers/din.py
+++ b/easy_rec/python/layers/din.py
@@ -1,5 +1,7 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+
 import tensorflow as tf
 
 from easy_rec.python.layers import dnn
@@ -21,13 +23,19 @@ def __call__(self, inputs, training=None, **kwargs):
     seq_input = [seq_fea for seq_fea, _ in seq_features]
     keys = tf.concat(seq_input, axis=-1)
 
+    query = target_feature
     target_emb_size = target_feature.shape.as_list()[-1]
     seq_emb_size = keys.shape.as_list()[-1]
-    assert target_emb_size == seq_emb_size, 'the embedding size of sequence and target item is not equal' \
-                                            ' in feature group:' + self.name
+    if target_emb_size != seq_emb_size:
+      logging.info('<din> the embedding size of sequence [%d] and target item [%d] is not equal'
+                   ' in feature group: %s', seq_emb_size, target_emb_size, self.name)
+      if target_emb_size < seq_emb_size:
+        query = tf.pad(target_feature, [[0, 0], [0, seq_emb_size-target_emb_size]])
+      else:
+        assert False, 'the embedding size of target item is larger than the one of sequence'
 
     batch_size, max_seq_len, _ = get_shape_list(keys, 3)
-    queries = tf.tile(tf.expand_dims(target_feature, 1), [1, max_seq_len, 1])
+    queries = tf.tile(tf.expand_dims(query, 1), [1, max_seq_len, 1])
     din_all = tf.concat([queries, keys, queries - keys, queries * keys],
                         axis=-1)
     din_layer = dnn.DNN(
@@ -48,6 +56,9 @@ def __call__(self, inputs, training=None, **kwargs):
     scores = scores / (seq_emb_size**0.5)
     # normalization with softmax is abandoned according to the original paper
     scores = tf.nn.sigmoid(scores)
+
+    if target_emb_size < seq_emb_size:
+      keys = keys[:, :, :target_emb_size]  # [B, L, E]
     output = tf.squeeze(tf.matmul(scores, keys), axis=[1])
     if self.config.need_target_feature:
       output = tf.concat([output, target_feature], axis=-1)
diff --git a/easy_rec/python/loss/jrc_loss.py b/easy_rec/python/loss/jrc_loss.py
new file mode 100644
index 000000000..930431da7
--- /dev/null
+++ b/easy_rec/python/loss/jrc_loss.py
@@ -0,0 +1,62 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+
+import tensorflow as tf
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name=''):
+  """Joint Optimization of Ranking and Calibration with Contextualized Hybrid Model.
+
+  Args:
+    labels: a `Tensor` with shape [batch_size]. e.g. click or not click in the session.
+    logits: a `Tensor` with shape [batch_size, 2]. e.g. the value of last neuron before activation.
+    session_ids: a `Tensor` with shape [batch_size, 1]. Session ids of each sample, used to max GAUC metric. e.g. user_id
+    alpha: the weight to balance ranking loss and calibration loss
+    auto_weight: bool, whether to learn loss weight between ranking loss and calibration loss
+    name: the name of loss
+  """
+  loss_name = name if name else 'jrc_loss'
+  logging.info('[{}] alpha: {}'.format(loss_name, alpha))
+
+  ce_loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
+
+  labels = tf.expand_dims(labels, 1)  # [B, 1]
+  labels = tf.concat([1 - labels, labels], axis=1)  # [B, 2]
+
+  batch_size = tf.shape(logits)[0]
+
+  # Mask: shape [B, B], mask[i,j]=1 indicates the i-th sample
+  # and j-th sample are in the same context
+  mask = tf.equal(session_ids, tf.transpose(session_ids))
+
+  # Tile logits and label: [B, 2]->[B, B, 2]
+  logits = tf.tile(tf.expand_dims(logits, 1), [1, batch_size, 1])
+  y = tf.tile(tf.expand_dims(labels, 1), [1, batch_size, 1])
+
+  # Set logits that are not in the same context to -inf
+  mask3d = tf.expand_dims(mask, 2)
+  y = y * mask3d
+  logits = logits + (1 - mask) * -1e9
+  y_neg, y_pos = y[:, :, 0], y[:, :, 1]
+  l_neg, l_pos = logits[:, :, 0], logits[:, :, 1]
+
+  # Compute list-wise generative loss -log p(x|y, z)
+  loss_pos = -tf.sum(y_pos * tf.nn.log_softmax(l_pos, axis=0), axis=0)
+  loss_neg = -tf.sum(y_neg * tf.nn.log_softmax(l_neg, axis=0), axis=0)
+  ge_loss = tf.mean((loss_pos+loss_neg)/tf.sum(mask, axis=0))
+
+  # The final JRC model
+  if auto_weight:
+    uncertainty1 = tf.Variable(0, name="%s_ranking_loss_weight" % loss_name, dtype=tf.float32)
+    tf.summary.scalar('loss/%s_ranking_uncertainty' % loss_name, uncertainty1)
+    uncertainty2 = tf.Variable(0, name="%s_calibration_loss_weight" % loss_name, dtype=tf.float32)
+    tf.summary.scalar('loss/%s_calibration_uncertainty' % loss_name, uncertainty2)
+    loss = tf.exp(-uncertainty1) * ce_loss + 0.5 * uncertainty1
+    loss += tf.exp(-uncertainty2) * ge_loss + 0.5 * uncertainty2
+  else:
+    loss = alpha*ce_loss + (1-alpha)*ge_loss
+  return loss
diff --git a/easy_rec/python/main.py b/easy_rec/python/main.py
index d74e8fe6e..1c7b82637 100644
--- a/easy_rec/python/main.py
+++ b/easy_rec/python/main.py
@@ -610,7 +610,6 @@ def distribute_evaluate(pipeline_config,
   eval_result_file = os.path.join(model_dir, eval_result_filename)
   logging.info('save eval result to file %s' % eval_result_file)
   if cur_job_name == 'master':
-    print('eval_result = ', eval_result)
     logging.info('eval_result = {0}'.format(eval_result))
     with gfile.GFile(eval_result_file, 'w') as ofile:
       result_to_write = {'eval_method': 'distribute'}
diff --git a/easy_rec/python/model/multi_task_model.py b/easy_rec/python/model/multi_task_model.py
index 4ffd404d9..0e49249ea 100644
--- a/easy_rec/python/model/multi_task_model.py
+++ b/easy_rec/python/model/multi_task_model.py
@@ -7,6 +7,7 @@
 from easy_rec.python.builders import loss_builder
 from easy_rec.python.model.rank_model import RankModel
 from easy_rec.python.protos import tower_pb2
+from easy_rec.python.protos.loss_pb2 import LossType
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -124,7 +125,15 @@ def build_loss_graph(self):
               loss_name=loss.loss_name,
               loss_param=loss_param)
           for loss_name, loss_value in loss_ops.items():
-            loss_dict[loss_name] = loss_value * loss.weight
+            if loss.learn_loss_weight:
+              uncertainty = tf.Variable(0, name="%s_loss_weight" % loss_name, dtype=tf.float32)
+              tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
+              if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
+                loss_dict[loss_name] = 0.5 * tf.exp(-uncertainty) * loss_value + 0.5 * uncertainty
+              else:
+                loss_dict[loss_name] = tf.exp(-uncertainty) * loss_value + 0.5 * uncertainty
+            else:
+              loss_dict[loss_name] = loss_value * loss.weight
 
       self._loss_dict.update(loss_dict)
 
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index b0463f10d..7a2b0dc76 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -41,12 +41,18 @@ def _output_to_prediction_impl(self,
         LossType.PAIRWISE_LOGISTIC_LOSS
     }
     if loss_type in binary_loss_type:
-      assert num_class == 1, 'num_class must be 1 when loss type is F1_REWEIGHTED_LOSS/PAIR_WISE_LOSS'
+      assert num_class == 1, 'num_class must be 1 when loss type is %s' % loss_type.name
       output = tf.squeeze(output, axis=1)
       probs = tf.sigmoid(output)
       tf.summary.scalar('prediction/probs', tf.reduce_mean(probs))
       prediction_dict['logits' + suffix] = output
       prediction_dict['probs' + suffix] = probs
+    elif loss_type == LossType.JRC_LOSS:
+      assert num_class == 2, 'num_class must be 2 when loss type is JRC_LOSS'
+      probs = tf.nn.softmax(output, axis=1)
+      tf.summary.scalar('prediction/probs', tf.reduce_mean(probs[:, 1]))
+      prediction_dict['logits' + suffix] = output
+      prediction_dict['probs' + suffix] = probs[:, 1]
     elif loss_type == LossType.CLASSIFICATION:
       if num_class == 1:
         output = tf.squeeze(output, axis=1)
@@ -103,7 +109,8 @@ def build_rtp_output_dict(self):
       binary_loss_set = {
           LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
           LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
-          LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS
+          LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
+          LossType.JRC_LOSS
       }
       if loss_types & binary_loss_set:
         if 'probs' in self._prediction_dict:
@@ -144,7 +151,7 @@ def _build_loss_impl(self,
     binary_loss_type = {
         LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
         LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
-        LossType.PAIRWISE_LOGISTIC_LOSS
+        LossType.PAIRWISE_LOGISTIC_LOSS, LossType.JRC_LOSS
     }
     if loss_type == LossType.CLASSIFICATION:
       loss_name = loss_name if loss_name else 'cross_entropy_loss' + suffix
@@ -198,7 +205,15 @@ def build_loss_graph(self):
             loss_name=loss.loss_name,
             loss_param=loss_param)
         for loss_name, loss_value in loss_ops.items():
-          loss_dict[loss_name] = loss_value * loss.weight
+          if loss.learn_loss_weight:
+            uncertainty = tf.Variable(0, name="%s_loss_weight" % loss_name, dtype=tf.float32)
+            tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
+            if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
+              loss_dict[loss_name] = 0.5 * tf.exp(-uncertainty) * loss_value + 0.5 * uncertainty
+            else:
+              loss_dict[loss_name] = tf.exp(-uncertainty) * loss_value + 0.5 * uncertainty
+          else:
+            loss_dict[loss_name] = loss_value * loss.weight
 
     self._loss_dict.update(loss_dict)
 
diff --git a/easy_rec/python/protos/loss.proto b/easy_rec/python/protos/loss.proto
index 7004972a0..9ec7c78c9 100644
--- a/easy_rec/python/protos/loss.proto
+++ b/easy_rec/python/protos/loss.proto
@@ -16,12 +16,14 @@ enum LossType {
     BINARY_FOCAL_LOSS = 10;
     PAIRWISE_FOCAL_LOSS = 11;
     PAIRWISE_LOGISTIC_LOSS = 12;
+    JRC_LOSS = 13;
 }
 
 message Loss {
   required LossType loss_type = 1;
-  required float weight = 2 [default = 1.0];
+  optional float weight = 2 [default = 1.0];
   optional string loss_name = 3;
+  optional bool learn_loss_weight = 4 [default = false];
   oneof loss_param {
     F1ReweighedLoss f1_reweighted_loss = 101;
     SoftmaxCrossEntropyWithNegativeMining softmax_loss = 102;
@@ -31,6 +33,7 @@ message Loss {
     PairwiseLoss pairwise_loss = 106;
     PairwiseFocalLoss pairwise_focal_loss = 107;
     PairwiseLogisticLoss pairwise_logistic_loss = 108;
+    JRCLoss jrc_loss = 109;
   }
 };
 
@@ -86,3 +89,8 @@ message PairwiseLogisticLoss {
   optional float hinge_margin = 3 [default = 1.0];
   optional float ohem_ratio = 4 [default = 1.0];
 }
+
+message JRCLoss {
+  required string session_name = 1;
+  optional float alpha = 2 [default = 0.5];
+}
\ No newline at end of file
diff --git a/easy_rec/python/tools/feature_selection.py b/easy_rec/python/tools/feature_selection.py
index 295698013..05b193897 100644
--- a/easy_rec/python/tools/feature_selection.py
+++ b/easy_rec/python/tools/feature_selection.py
@@ -20,7 +20,7 @@
 import matplotlib.pyplot as plt  # NOQA
 
 tf.app.flags.DEFINE_string('model_type', 'variational_dropout',
-                           'feature selection model tyoe')
+                           'feature selection model type')
 tf.app.flags.DEFINE_string('config_path', '',
                            'feature selection model config path')
 tf.app.flags.DEFINE_string('checkpoint_path', None,
diff --git a/easy_rec/python/utils/load_class.py b/easy_rec/python/utils/load_class.py
index 5db92a05f..2da1e4e41 100644
--- a/easy_rec/python/utils/load_class.py
+++ b/easy_rec/python/utils/load_class.py
@@ -37,6 +37,8 @@ def load_by_path(path):
   path = path.strip()
   if path == '' or path is None:
     return None
+  if 'lambda' in path:
+    return eval(path)
   components = path.split('.')
   if components[0] == 'tf':
     components[0] = 'tensorflow'
diff --git a/pai_jobs/run.py b/pai_jobs/run.py
index e0e861a97..ed02c73c5 100644
--- a/pai_jobs/run.py
+++ b/pai_jobs/run.py
@@ -381,7 +381,7 @@ def main(argv):
     # TODO: support multi-worker evaluation
     if not FLAGS.distribute_eval:
       assert len(
-          FLAGS.worker_hosts.split(',')) == 1, 'evaluate only need 1 woker'
+          FLAGS.worker_hosts.split(',')) == 1, 'evaluate only need 1 worker'
     config_util.auto_expand_share_feature_configs(pipeline_config)
 
     if FLAGS.eval_tables:

From c7476bba10c9e7ad0d2b86481dd558efbb8d6b4b Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 4 Apr 2023 19:39:13 +0800
Subject: [PATCH 12/54] [feat]: add jrc loss

---
 easy_rec/python/loss/jrc_loss.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/easy_rec/python/loss/jrc_loss.py b/easy_rec/python/loss/jrc_loss.py
index 930431da7..51cad279b 100644
--- a/easy_rec/python/loss/jrc_loss.py
+++ b/easy_rec/python/loss/jrc_loss.py
@@ -14,7 +14,7 @@ def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name='')
   Args:
     labels: a `Tensor` with shape [batch_size]. e.g. click or not click in the session.
     logits: a `Tensor` with shape [batch_size, 2]. e.g. the value of last neuron before activation.
-    session_ids: a `Tensor` with shape [batch_size, 1]. Session ids of each sample, used to max GAUC metric. e.g. user_id
+    session_ids: a `Tensor` with shape [batch_size]. Session ids of each sample, used to max GAUC metric. e.g. user_id
     alpha: the weight to balance ranking loss and calibration loss
     auto_weight: bool, whether to learn loss weight between ranking loss and calibration loss
     name: the name of loss
@@ -31,7 +31,7 @@ def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name='')
 
   # Mask: shape [B, B], mask[i,j]=1 indicates the i-th sample
   # and j-th sample are in the same context
-  mask = tf.equal(session_ids, tf.transpose(session_ids))
+  mask = tf.equal(tf.expand_dims(session_ids, 1), tf.expand_dims(session_ids, 0))
 
   # Tile logits and label: [B, 2]->[B, B, 2]
   logits = tf.tile(tf.expand_dims(logits, 1), [1, batch_size, 1])

From 7f6ee53b1fd8bd21d1b4125966c86e6f483a83bc Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 4 Apr 2023 20:01:01 +0800
Subject: [PATCH 13/54] [feat]: add jrc loss

---
 easy_rec/python/loss/jrc_loss.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/easy_rec/python/loss/jrc_loss.py b/easy_rec/python/loss/jrc_loss.py
index 51cad279b..18b372507 100644
--- a/easy_rec/python/loss/jrc_loss.py
+++ b/easy_rec/python/loss/jrc_loss.py
@@ -11,6 +11,8 @@
 def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name=''):
   """Joint Optimization of Ranking and Calibration with Contextualized Hybrid Model.
 
+     https://arxiv.org/abs/2208.06164
+
   Args:
     labels: a `Tensor` with shape [batch_size]. e.g. click or not click in the session.
     logits: a `Tensor` with shape [batch_size, 2]. e.g. the value of last neuron before activation.
@@ -32,6 +34,7 @@ def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name='')
   # Mask: shape [B, B], mask[i,j]=1 indicates the i-th sample
   # and j-th sample are in the same context
   mask = tf.equal(tf.expand_dims(session_ids, 1), tf.expand_dims(session_ids, 0))
+  mask = tf.to_float(mask)
 
   # Tile logits and label: [B, 2]->[B, B, 2]
   logits = tf.tile(tf.expand_dims(logits, 1), [1, batch_size, 1])
@@ -40,7 +43,7 @@ def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name='')
   # Set logits that are not in the same context to -inf
   mask3d = tf.expand_dims(mask, 2)
   y = y * mask3d
-  logits = logits + (1 - mask) * -1e9
+  logits = logits + (1 - mask3d) * -1e9
   y_neg, y_pos = y[:, :, 0], y[:, :, 1]
   l_neg, l_pos = logits[:, :, 0], logits[:, :, 1]
 

From 98f9ec43fae686889f105ca09aa847805439bb43 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 7 Apr 2023 11:54:08 +0800
Subject: [PATCH 14/54] [feat]: add jrc loss

---
 easy_rec/python/builders/loss_builder.py  |  8 +++--
 easy_rec/python/input/input.py            | 38 +++++++++++------------
 easy_rec/python/layers/din.py             |  8 +++--
 easy_rec/python/loss/jrc_loss.py          | 32 ++++++++++++-------
 easy_rec/python/model/multi_task_model.py |  9 ++++--
 easy_rec/python/model/rank_model.py       | 28 ++++++++++-------
 easy_rec/python/protos/loss.proto         |  2 +-
 7 files changed, 74 insertions(+), 51 deletions(-)

diff --git a/easy_rec/python/builders/loss_builder.py b/easy_rec/python/builders/loss_builder.py
index 5427c0d54..cb10d870d 100644
--- a/easy_rec/python/builders/loss_builder.py
+++ b/easy_rec/python/builders/loss_builder.py
@@ -5,10 +5,10 @@
 import tensorflow as tf
 
 from easy_rec.python.loss.focal_loss import sigmoid_focal_loss_with_logits
+from easy_rec.python.loss.jrc_loss import jrc_loss
 from easy_rec.python.loss.pairwise_loss import pairwise_focal_loss
 from easy_rec.python.loss.pairwise_loss import pairwise_logistic_loss
 from easy_rec.python.loss.pairwise_loss import pairwise_loss
-from easy_rec.python.loss.jrc_loss import jrc_loss
 from easy_rec.python.protos.loss_pb2 import LossType
 
 from easy_rec.python.loss.f1_reweight_loss import f1_reweight_sigmoid_cross_entropy  # NOQA
@@ -42,9 +42,11 @@ def build(loss_type,
         labels=label, predictions=pred, weights=loss_weight, **kwargs)
   elif loss_type == LossType.JRC_LOSS:
     alpha = 0.5 if loss_param is None else loss_param.alpha
-    auto_weight = False if loss_param is None else not loss_param.HasField('alpha')
+    auto_weight = False if loss_param is None else not loss_param.HasField(
+        'alpha')
     session = kwargs.get('session_ids', None)
-    return jrc_loss(label, pred, session, alpha, auto_weight=auto_weight, name=loss_name)
+    return jrc_loss(
+        label, pred, session, alpha, auto_weight=auto_weight, name=loss_name)
   elif loss_type == LossType.PAIR_WISE_LOSS:
     session = kwargs.get('session_ids', None)
     margin = 0 if loss_param is None else loss_param.margin
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index 09d4c299b..bef412460 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -253,6 +253,8 @@ def create_multi_placeholders(self, export_config):
     inputs = {}
     for fid in effective_fids:
       input_name = self._input_fields[fid]
+      if input_name == sample_weight_field:
+        continue
       if placeholder_named_by_input:
         placeholder_name = input_name
       else:
@@ -482,8 +484,7 @@ def _parse_raw_feature(self, fc, parsed_dict, field_dict):
       if fc.raw_input_dim > 1:
         check_list = [
             tf.py_func(
-                check_split,
-                [vals, fc.separator, fc.raw_input_dim, input_0],
+                check_split, [vals, fc.separator, fc.raw_input_dim, input_0],
                 Tout=tf.bool)
         ] if self._check_mode else []
         with tf.control_dependencies(check_list):
@@ -512,19 +513,17 @@ def _parse_raw_feature(self, fc, parsed_dict, field_dict):
           parsed_dict[feature_name] = emb
         else:
           parsed_dict[feature_name] = tf.sparse_to_dense(
-            tmp_fea.indices,
-            [tf.shape(field_dict[input_0])[0], fc.raw_input_dim],
-            tmp_vals,
-            default_value=0)
+              tmp_fea.indices,
+              [tf.shape(field_dict[input_0])[0], fc.raw_input_dim],
+              tmp_vals,
+              default_value=0)
       elif fc.HasField('seq_multi_sep') and fc.HasField('combiner'):
         check_list = [
-          tf.py_func(
-            check_string_to_number, [vals, input_0],
-            Tout=tf.bool)
+            tf.py_func(check_string_to_number, [vals, input_0], Tout=tf.bool)
         ] if self._check_mode else []
         with tf.control_dependencies(check_list):
-          emb = tf.string_to_number(vals, tf.float32,
-                                    name='raw_fea_to_flt_%s' % input_0)
+          emb = tf.string_to_number(
+              vals, tf.float32, name='raw_fea_to_flt_%s' % input_0)
         if fc.combiner == 'max':
           emb = tf.segment_max(emb, segment_ids)
         elif fc.combiner == 'sum':
@@ -537,13 +536,13 @@ def _parse_raw_feature(self, fc, parsed_dict, field_dict):
           assert False, 'unsupported combine operator: ' + fc.combiner
         parsed_dict[feature_name] = emb
       else:
-         check_list = [
-           tf.py_func(
-             check_string_to_number, [field_dict[input_0], input_0],
-             Tout=tf.bool)
-         ] if self._check_mode else []
-         with tf.control_dependencies(check_list):
-           parsed_dict[feature_name] = tf.string_to_number(
+        check_list = [
+            tf.py_func(
+                check_string_to_number, [field_dict[input_0], input_0],
+                Tout=tf.bool)
+        ] if self._check_mode else []
+        with tf.control_dependencies(check_list):
+          parsed_dict[feature_name] = tf.string_to_number(
               field_dict[input_0], tf.float32)
     elif field_dict[input_0].dtype in [
         tf.int32, tf.int64, tf.double, tf.float32
@@ -557,7 +556,8 @@ def _parse_raw_feature(self, fc, parsed_dict, field_dict):
           fc.max_val - fc.min_val)
 
     if fc.HasField('normalizer_fn'):
-      logging.info('apply normalizer_fn %s to `%s`' % (fc.normalizer_fn, feature_name))
+      logging.info('apply normalizer_fn %s to `%s`' %
+                   (fc.normalizer_fn, feature_name))
       parsed_dict[feature_name] = self._normalizer_fn[feature_name](
           parsed_dict[feature_name])
 
diff --git a/easy_rec/python/layers/din.py b/easy_rec/python/layers/din.py
index 717dd9789..81f661165 100644
--- a/easy_rec/python/layers/din.py
+++ b/easy_rec/python/layers/din.py
@@ -27,10 +27,12 @@ def __call__(self, inputs, training=None, **kwargs):
     target_emb_size = target_feature.shape.as_list()[-1]
     seq_emb_size = keys.shape.as_list()[-1]
     if target_emb_size != seq_emb_size:
-      logging.info('<din> the embedding size of sequence [%d] and target item [%d] is not equal'
-                   ' in feature group: %s', seq_emb_size, target_emb_size, self.name)
+      logging.info(
+          '<din> the embedding size of sequence [%d] and target item [%d] is not equal'
+          ' in feature group: %s', seq_emb_size, target_emb_size, self.name)
       if target_emb_size < seq_emb_size:
-        query = tf.pad(target_feature, [[0, 0], [0, seq_emb_size-target_emb_size]])
+        query = tf.pad(target_feature,
+                       [[0, 0], [0, seq_emb_size - target_emb_size]])
       else:
         assert False, 'the embedding size of target item is larger than the one of sequence'
 
diff --git a/easy_rec/python/loss/jrc_loss.py b/easy_rec/python/loss/jrc_loss.py
index 18b372507..fc8266b2c 100644
--- a/easy_rec/python/loss/jrc_loss.py
+++ b/easy_rec/python/loss/jrc_loss.py
@@ -8,7 +8,12 @@
   tf = tf.compat.v1
 
 
-def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name=''):
+def jrc_loss(labels,
+             logits,
+             session_ids,
+             alpha=0.5,
+             auto_weight=False,
+             name=''):
   """Joint Optimization of Ranking and Calibration with Contextualized Hybrid Model.
 
      https://arxiv.org/abs/2208.06164
@@ -22,7 +27,8 @@ def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name='')
     name: the name of loss
   """
   loss_name = name if name else 'jrc_loss'
-  logging.info('[{}] alpha: {}'.format(loss_name, alpha))
+  logging.info('[{}] alpha: {}, auto_weight: {}'.format(loss_name, alpha,
+                                                        auto_weight))
 
   ce_loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
 
@@ -33,7 +39,8 @@ def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name='')
 
   # Mask: shape [B, B], mask[i,j]=1 indicates the i-th sample
   # and j-th sample are in the same context
-  mask = tf.equal(tf.expand_dims(session_ids, 1), tf.expand_dims(session_ids, 0))
+  mask = tf.equal(
+      tf.expand_dims(session_ids, 1), tf.expand_dims(session_ids, 0))
   mask = tf.to_float(mask)
 
   # Tile logits and label: [B, 2]->[B, B, 2]
@@ -42,24 +49,27 @@ def jrc_loss(labels, logits, session_ids, alpha=0.5, auto_weight=False, name='')
 
   # Set logits that are not in the same context to -inf
   mask3d = tf.expand_dims(mask, 2)
-  y = y * mask3d
+  y = tf.to_float(y) * mask3d
   logits = logits + (1 - mask3d) * -1e9
   y_neg, y_pos = y[:, :, 0], y[:, :, 1]
   l_neg, l_pos = logits[:, :, 0], logits[:, :, 1]
 
   # Compute list-wise generative loss -log p(x|y, z)
-  loss_pos = -tf.sum(y_pos * tf.nn.log_softmax(l_pos, axis=0), axis=0)
-  loss_neg = -tf.sum(y_neg * tf.nn.log_softmax(l_neg, axis=0), axis=0)
-  ge_loss = tf.mean((loss_pos+loss_neg)/tf.sum(mask, axis=0))
+  loss_pos = -tf.reduce_sum(y_pos * tf.nn.log_softmax(l_pos, axis=0), axis=0)
+  loss_neg = -tf.reduce_sum(y_neg * tf.nn.log_softmax(l_neg, axis=0), axis=0)
+  ge_loss = tf.reduce_mean((loss_pos + loss_neg) / tf.reduce_sum(mask, axis=0))
 
   # The final JRC model
   if auto_weight:
-    uncertainty1 = tf.Variable(0, name="%s_ranking_loss_weight" % loss_name, dtype=tf.float32)
+    uncertainty1 = tf.Variable(
+        0, name='%s_ranking_loss_weight' % loss_name, dtype=tf.float32)
     tf.summary.scalar('loss/%s_ranking_uncertainty' % loss_name, uncertainty1)
-    uncertainty2 = tf.Variable(0, name="%s_calibration_loss_weight" % loss_name, dtype=tf.float32)
-    tf.summary.scalar('loss/%s_calibration_uncertainty' % loss_name, uncertainty2)
+    uncertainty2 = tf.Variable(
+        0, name='%s_calibration_loss_weight' % loss_name, dtype=tf.float32)
+    tf.summary.scalar('loss/%s_calibration_uncertainty' % loss_name,
+                      uncertainty2)
     loss = tf.exp(-uncertainty1) * ce_loss + 0.5 * uncertainty1
     loss += tf.exp(-uncertainty2) * ge_loss + 0.5 * uncertainty2
   else:
-    loss = alpha*ce_loss + (1-alpha)*ge_loss
+    loss = alpha * ce_loss + (1 - alpha) * ge_loss
   return loss
diff --git a/easy_rec/python/model/multi_task_model.py b/easy_rec/python/model/multi_task_model.py
index 0e49249ea..43e5663ce 100644
--- a/easy_rec/python/model/multi_task_model.py
+++ b/easy_rec/python/model/multi_task_model.py
@@ -126,12 +126,15 @@ def build_loss_graph(self):
               loss_param=loss_param)
           for loss_name, loss_value in loss_ops.items():
             if loss.learn_loss_weight:
-              uncertainty = tf.Variable(0, name="%s_loss_weight" % loss_name, dtype=tf.float32)
+              uncertainty = tf.Variable(
+                  0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
               tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
               if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
-                loss_dict[loss_name] = 0.5 * tf.exp(-uncertainty) * loss_value + 0.5 * uncertainty
+                loss_dict[loss_name] = 0.5 * tf.exp(
+                    -uncertainty) * loss_value + 0.5 * uncertainty
               else:
-                loss_dict[loss_name] = tf.exp(-uncertainty) * loss_value + 0.5 * uncertainty
+                loss_dict[loss_name] = tf.exp(
+                    -uncertainty) * loss_value + 0.5 * uncertainty
             else:
               loss_dict[loss_name] = loss_value * loss.weight
 
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index 7a2b0dc76..25eff23ea 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -206,12 +206,15 @@ def build_loss_graph(self):
             loss_param=loss_param)
         for loss_name, loss_value in loss_ops.items():
           if loss.learn_loss_weight:
-            uncertainty = tf.Variable(0, name="%s_loss_weight" % loss_name, dtype=tf.float32)
+            uncertainty = tf.Variable(
+                0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
             tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
             if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
-              loss_dict[loss_name] = 0.5 * tf.exp(-uncertainty) * loss_value + 0.5 * uncertainty
+              loss_dict[loss_name] = 0.5 * tf.exp(
+                  -uncertainty) * loss_value + 0.5 * uncertainty
             else:
-              loss_dict[loss_name] = tf.exp(-uncertainty) * loss_value + 0.5 * uncertainty
+              loss_dict[loss_name] = tf.exp(
+                  -uncertainty) * loss_value + 0.5 * uncertainty
           else:
             loss_dict[loss_name] = loss_value * loss.weight
 
@@ -237,12 +240,13 @@ def _build_metric_impl(self,
     binary_loss_set = {
         LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
         LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
-        LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS
+        LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
+        LossType.JRC_LOSS
     }
     metric_dict = {}
     if metric.WhichOneof('metric') == 'auc':
       assert loss_type & binary_loss_set
-      if num_class == 1:
+      if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['auc' + suffix] = metrics_tf.auc(
             label,
@@ -258,7 +262,7 @@ def _build_metric_impl(self,
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'gauc':
       assert loss_type & binary_loss_set
-      if num_class == 1:
+      if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         uids = self._feature_dict[metric.gauc.uid_field]
         if isinstance(uids, tf.sparse.SparseTensor):
@@ -281,7 +285,7 @@ def _build_metric_impl(self,
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'session_auc':
       assert loss_type & binary_loss_set
-      if num_class == 1:
+      if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['session_auc' + suffix] = metrics_lib.session_auc(
             label,
@@ -299,7 +303,7 @@ def _build_metric_impl(self,
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'max_f1':
       assert loss_type & binary_loss_set
-      if num_class == 1:
+      if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['max_f1' + suffix] = metrics_lib.max_f1(
             label, self._prediction_dict['logits' + suffix])
@@ -376,11 +380,13 @@ def build_metric_graph(self, eval_config):
 
   def _get_outputs_impl(self, loss_type, num_class=1, suffix=''):
     binary_loss_set = {
-        LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
-        LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
-        LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS
+        LossType.F1_REWEIGHTED_LOSS, LossType.JRC_LOSS, LossType.PAIR_WISE_LOSS,
+        LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
+        LossType.PAIRWISE_LOGISTIC_LOSS
     }
     if loss_type in binary_loss_set:
+      return ['probs' + suffix, 'logits' + suffix]
+    if loss_type == LossType.CLASSIFICATION:
       if num_class == 1:
         return ['probs' + suffix, 'logits' + suffix]
       else:
diff --git a/easy_rec/python/protos/loss.proto b/easy_rec/python/protos/loss.proto
index 9ec7c78c9..c5b74f47d 100644
--- a/easy_rec/python/protos/loss.proto
+++ b/easy_rec/python/protos/loss.proto
@@ -93,4 +93,4 @@ message PairwiseLogisticLoss {
 message JRCLoss {
   required string session_name = 1;
   optional float alpha = 2 [default = 0.5];
-}
\ No newline at end of file
+}

From 958621226f90faede38976f2f73aeaca731cf84d Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 7 Apr 2023 20:36:49 +0800
Subject: [PATCH 15/54] [feat]: add jrc loss

---
 easy_rec/python/builders/loss_builder.py      |   2 +-
 easy_rec/python/input/augment.py              |  55 ++++
 easy_rec/python/input/input.py                |   3 +-
 easy_rec/python/layers/bst.py                 | 110 ++++---
 .../layers/multihead_cross_attention.py       | 112 ++++---
 easy_rec/python/layers/sequence_encoder.py    |   2 +-
 easy_rec/python/loss/nce_loss.py              |  34 ++
 easy_rec/python/main.py                       |   1 +
 easy_rec/python/model/easy_rec_model.py       |   4 +-
 easy_rec/python/model/match_model.py          |   3 +-
 easy_rec/python/protos/layer.proto            |   4 +
 easy_rec/python/test/train_eval_test.py       |   5 +
 samples/model_config/bst_cl_on_taobao.config  | 304 ++++++++++++++++++
 13 files changed, 539 insertions(+), 100 deletions(-)
 create mode 100644 easy_rec/python/input/augment.py
 create mode 100644 easy_rec/python/loss/nce_loss.py
 create mode 100644 samples/model_config/bst_cl_on_taobao.config

diff --git a/easy_rec/python/builders/loss_builder.py b/easy_rec/python/builders/loss_builder.py
index cb10d870d..7459372a5 100644
--- a/easy_rec/python/builders/loss_builder.py
+++ b/easy_rec/python/builders/loss_builder.py
@@ -24,7 +24,7 @@ def build(loss_type,
           num_class=1,
           loss_param=None,
           **kwargs):
-  loss_name = kwargs.pop('loss_name')
+  loss_name = kwargs.pop('loss_name') if 'loss_name' in kwargs else 'unknown'
   if loss_type == LossType.CLASSIFICATION:
     if num_class == 1:
       return tf.losses.sigmoid_cross_entropy(
diff --git a/easy_rec/python/input/augment.py b/easy_rec/python/input/augment.py
new file mode 100644
index 000000000..75298c430
--- /dev/null
+++ b/easy_rec/python/input/augment.py
@@ -0,0 +1,55 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+from easy_rec.python.utils.shape_utils import get_shape_list
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+def assign(input_tensor, position=None, value=None):
+    input_tensor[tuple(position)] = value
+    return input_tensor
+
+
+def item_mask(aug_data, length, gamma=0.3):
+    length1 = tf.cast(length, dtype=tf.float32)
+    num_mask = tf.cast(tf.math.floor(length1 * gamma), dtype=tf.int32)
+    seq = tf.range(length, dtype=tf.int32)
+    mask_index = tf.random.shuffle(seq)[:num_mask]
+    masked_item_seq = aug_data
+    masked_item_seq = tf.py_func(assign, inp=[masked_item_seq, [mask_index], 0], Tout=masked_item_seq.dtype)
+    return masked_item_seq, length
+
+
+def item_crop(aug_data, length, eta=0.6):
+    length1 = tf.cast(length, dtype=tf.float32)
+    max_length = tf.cast(get_shape_list(aug_data)[0], dtype=tf.int32)
+    embedding_size = get_shape_list(aug_data)[1]
+
+    num_left = tf.cast(tf.math.floor(length1 * eta), dtype=tf.int32)
+    crop_begin = tf.random.uniform([1], minval=0, maxval=length - num_left, dtype=tf.int32)[0]
+    cropped_item_seq = tf.zeros([get_shape_list(aug_data)[0], embedding_size])
+    cropped_item_seq = tf.where(crop_begin + num_left < max_length,
+                                tf.concat([aug_data[crop_begin:crop_begin + num_left],
+                                           cropped_item_seq[:max_length - num_left]], axis=0),
+                                tf.concat([aug_data[crop_begin:], cropped_item_seq[:crop_begin]], axis=0))
+    return cropped_item_seq, num_left
+
+
+def augment(x):
+    seq, length = x
+    flag = tf.range(2, dtype=tf.int32)
+    flag1 = tf.random.shuffle(flag)[:1][0]
+    aug_seq, aug_len = tf.cond(tf.equal(flag1, 0), lambda: item_crop(seq, length), lambda: item_mask(seq, length))
+    return [aug_seq, aug_len]
+
+
+def input_aug_data(original_data, seq_len):
+    print("seq_len:", seq_len)
+    lengths = tf.cast(seq_len, dtype=tf.int32)
+    aug_seq1, aug_len1 = tf.map_fn(augment, elems=(original_data, lengths), dtype=[tf.float32, tf.int32])
+    aug_seq2, aug_len2 = tf.map_fn(augment, elems=(original_data, lengths), dtype=[tf.float32, tf.int32])
+    aug_seq1 = tf.reshape(aug_seq1, tf.shape(original_data))
+    aug_seq2 = tf.reshape(aug_seq2, tf.shape(original_data))
+    return aug_seq1, aug_seq2, aug_len1, aug_len2
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index bef412460..4aec1ed17 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -141,7 +141,8 @@ def __init__(self,
       model_name = model_config.WhichOneof('model')
       if model_name in {'mmoe', 'esmm', 'dbmtl', 'simple_multi_task', 'ple'}:
         model = getattr(model_config, model_name)
-        for tower in model.task_towers:
+        towers = [model.ctr_tower, model.cvr_tower] if model_name == 'esmm' else model.task_towers
+        for tower in towers:
           metrics = tower.metrics_set
           for metric in metrics:
             metric_name = metric.WhichOneof('metric')
diff --git a/easy_rec/python/layers/bst.py b/easy_rec/python/layers/bst.py
index 87e12770c..466676fd9 100644
--- a/easy_rec/python/layers/bst.py
+++ b/easy_rec/python/layers/bst.py
@@ -5,18 +5,51 @@
 from easy_rec.python.layers import multihead_cross_attention
 from easy_rec.python.utils.activation import get_activation
 from easy_rec.python.utils.shape_utils import get_shape_list
-
+from easy_rec.python.loss.nce_loss import nce_loss
+from easy_rec.python.input.augment import input_aug_data
 # from tensorflow.python.keras.layers import Layer
 
 
 class BST(object):
 
-  def __init__(self, config, l2_reg, name='din', **kwargs):
+  def __init__(self, config, l2_reg, name='bst', **kwargs):
     # super(BST, self).__init__(name=name, **kwargs)
     self.name = name
     self.l2_reg = l2_reg
     self.config = config
 
+  def encode(self, seq_input, max_position):
+    seq_fea = multihead_cross_attention.embedding_postprocessor(
+      seq_input,
+      position_embedding_name=self.name + '/position_embeddings',
+      max_position_embeddings=max_position,
+      reuse_position_embedding=tf.AUTO_REUSE)
+
+    n = tf.count_nonzero(seq_input, axis=-1)
+    seq_mask = tf.cast(n > 0, tf.int32)
+
+    attention_mask = multihead_cross_attention.create_attention_mask_from_input_mask(
+      from_tensor=seq_fea, to_mask=seq_mask)
+
+    hidden_act = get_activation(self.config.hidden_act)
+    attention_fea = multihead_cross_attention.transformer_encoder(
+      seq_fea,
+      hidden_size=self.config.hidden_size,
+      num_hidden_layers=self.config.num_hidden_layers,
+      num_attention_heads=self.config.num_attention_heads,
+      attention_mask=attention_mask,
+      intermediate_size=self.config.intermediate_size,
+      intermediate_act_fn=hidden_act,
+      hidden_dropout_prob=self.config.hidden_dropout_prob,
+      attention_probs_dropout_prob=self.config.attention_probs_dropout_prob,
+      initializer_range=self.config.initializer_range,
+      name=self.name + '/bst',
+      reuse=tf.AUTO_REUSE)
+    # attention_fea shape: [batch_size, seq_length, hidden_size]
+    out_fea = attention_fea[:, 0, :]  # target feature
+    print('bst output shape:', out_fea.shape)
+    return out_fea
+
   def __call__(self, inputs, training=None, **kwargs):
     seq_features, target_feature = inputs
     if not training:
@@ -36,58 +69,49 @@ def __call__(self, inputs, training=None, **kwargs):
     with tf.control_dependencies([valid_len]):
       # seq_input: [batch_size, seq_len, embed_size]
       seq_input = tf.concat(seq_embeds, axis=-1)
+    if target_feature is not None:
+      max_position += 1
+
+    seq_embed_size = seq_input.shape.as_list()[-1]
+    if seq_embed_size != self.config.hidden_size:
+      seq_input = tf.layers.dense(
+          seq_input,
+          self.config.hidden_size,
+          activation=tf.nn.relu,
+          kernel_regularizer=self.l2_reg)
 
     # seq_len: [batch_size, 1], the true length of each sequence
     seq_len = seq_features[0][1]
-    seq_embed_size = seq_input.shape.as_list()[-1]
+
+    if self.config.need_contrastive_learning:
+      assert 'loss_dict' in kwargs, "no `loss_dict` in kwargs of bst layer: %s" % self.name
+      loss = self.contrastive_loss(seq_input, seq_len, max_position)
+      loss *= self.config.contrastive_loss_weight
+      loss_dict = kwargs['loss_dict']
+      loss_dict['contrastive_loss'] = loss
+      tf.summary.scalar('loss/%s_contrastive_loss' % self.name, loss)
+
     if target_feature is not None:
       target_size = target_feature.shape.as_list()[-1]
       assert seq_embed_size == target_size, 'the embedding size of sequence and target item is not equal' \
                                             ' in feature group:' + self.name
+      if target_size != self.config.hidden_size:
+        target_feature = tf.layers.dense(
+          target_feature,
+          self.config.hidden_size,
+          activation=tf.nn.relu,
+          kernel_regularizer=self.l2_reg)
       # target_feature: [batch_size, 1, embed_size]
       target_feature = tf.expand_dims(target_feature, 1)
       # seq_input: [batch_size, seq_len+1, embed_size]
       seq_input = tf.concat([target_feature, seq_input], axis=1)
-      max_seq_len += 1
-      seq_len += 1
-      max_position += 1
 
-    if seq_embed_size != self.config.hidden_size:
-      seq_input = tf.layers.dense(
-          seq_input,
-          self.config.hidden_size,
-          activation=tf.nn.relu,
-          kernel_regularizer=self.l2_reg)
-
-    seq_fea = multihead_cross_attention.embedding_postprocessor(
-        seq_input,
-        position_embedding_name=self.name + '/position_embeddings',
-        max_position_embeddings=max_position)
-    seq_mask = tf.map_fn(
-        fn=lambda t: dynamic_mask(t, max_seq_len), elems=tf.to_int32(seq_len))
-    attention_mask = multihead_cross_attention.create_attention_mask_from_input_mask(
-        from_tensor=seq_fea, to_mask=seq_mask)
-
-    hidden_act = get_activation(self.config.hidden_act)
-    attention_fea = multihead_cross_attention.transformer_encoder(
-        seq_fea,
-        hidden_size=self.config.hidden_size,
-        num_hidden_layers=self.config.num_hidden_layers,
-        num_attention_heads=self.config.num_attention_heads,
-        attention_mask=attention_mask,
-        intermediate_size=self.config.intermediate_size,
-        intermediate_act_fn=hidden_act,
-        hidden_dropout_prob=self.config.hidden_dropout_prob,
-        attention_probs_dropout_prob=self.config.attention_probs_dropout_prob,
-        initializer_range=self.config.initializer_range,
-        name=self.name + '/bst')
-    # attention_fea shape: [batch_size, seq_length, hidden_size]
-    out_fea = attention_fea[:, 0, :]  # target feature
-    print('bst output shape:', out_fea.shape)
-    return out_fea
+    return self.encode(seq_input, max_position)
 
+  def contrastive_loss(self, seq_input, seq_len, max_position):
+    aug_seq1, aug_seq2, aug_len1, aug_len2 = input_aug_data(seq_input, seq_len)
+    seq_output1 = self.encode(aug_seq1, max_position)
+    seq_output2 = self.encode(aug_seq2, max_position)
+    loss = nce_loss(seq_output1, seq_output2)
+    return loss
 
-def dynamic_mask(x, max_len):
-  ones = tf.ones(shape=tf.stack([x]), dtype=tf.int32)
-  zeros = tf.zeros(shape=tf.stack([max_len - x]), dtype=tf.int32)
-  return tf.concat([ones, zeros], axis=0)
diff --git a/easy_rec/python/layers/multihead_cross_attention.py b/easy_rec/python/layers/multihead_cross_attention.py
index 92b2b64df..41bde3c5e 100644
--- a/easy_rec/python/layers/multihead_cross_attention.py
+++ b/easy_rec/python/layers/multihead_cross_attention.py
@@ -52,7 +52,8 @@ def attention_layer(from_tensor,
                     do_return_2d_tensor=False,
                     batch_size=None,
                     from_seq_length=None,
-                    to_seq_length=None):
+                    to_seq_length=None,
+                    reuse=None):
   """Performs multi-headed attention from `from_tensor` to `to_tensor`.
 
   This is an implementation of multi-headed attention based on "Attention is all you Need".
@@ -95,6 +96,7 @@ def attention_layer(from_tensor,
       of the 3D version of the `from_tensor`.
     to_seq_length: (Optional) If the input is 2D, this might be the seq length
       of the 3D version of the `to_tensor`.
+    reuse: whether to reuse this layer
 
   Returns:
     float Tensor of shape [batch_size, from_seq_length,
@@ -144,27 +146,30 @@ def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
 
   # `query_layer` = [B*F, N*H]
   query_layer = tf.layers.dense(
-      from_tensor_2d,
-      num_attention_heads * size_per_head,
-      activation=query_act,
-      name='query',
-      kernel_initializer=create_initializer(initializer_range))
+    from_tensor_2d,
+    num_attention_heads * size_per_head,
+    activation=query_act,
+    name='query',
+    kernel_initializer=create_initializer(initializer_range),
+    reuse=reuse)
 
   # `key_layer` = [B*T, N*H]
   key_layer = tf.layers.dense(
-      to_tensor_2d,
-      num_attention_heads * size_per_head,
-      activation=key_act,
-      name='key',
-      kernel_initializer=create_initializer(initializer_range))
+    to_tensor_2d,
+    num_attention_heads * size_per_head,
+    activation=key_act,
+    name='key',
+    kernel_initializer=create_initializer(initializer_range),
+    reuse=reuse)
 
   # `value_layer` = [B*T, N*H]
   value_layer = tf.layers.dense(
-      to_tensor_2d,
-      num_attention_heads * size_per_head,
-      activation=value_act,
-      name='value',
-      kernel_initializer=create_initializer(initializer_range))
+    to_tensor_2d,
+    num_attention_heads * size_per_head,
+    activation=value_act,
+    name='value',
+    kernel_initializer=create_initializer(initializer_range),
+    reuse=reuse)
 
   # `query_layer` = [B, N, F, H]
   query_layer = transpose_for_scores(query_layer, batch_size,
@@ -232,16 +237,17 @@ def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
 
 
 def transformer_encoder(input_tensor,
-                        attention_mask=None,
-                        hidden_size=768,
-                        num_hidden_layers=12,
-                        num_attention_heads=12,
-                        intermediate_size=3072,
-                        intermediate_act_fn=gelu,
-                        hidden_dropout_prob=0.1,
-                        attention_probs_dropout_prob=0.1,
-                        initializer_range=0.02,
-                        name='transformer'):
+            attention_mask=None,
+            hidden_size=768,
+            num_hidden_layers=12,
+            num_attention_heads=12,
+            intermediate_size=3072,
+            intermediate_act_fn=gelu,
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            initializer_range=0.02,
+            reuse=None,
+            name='transformer'):
   """Multi-headed, multi-layer Transformer from "Attention is All You Need".
 
   This is almost an exact implementation of the original Transformer encoder.
@@ -304,21 +310,23 @@ def transformer_encoder(input_tensor,
         with tf.variable_scope('self'):
           # [batch_size * from_seq_length, num_attention_heads * size_per_head]
           attention_output = attention_layer(
-              from_tensor=layer_input,
-              to_tensor=layer_input,
-              size_per_head=attention_head_size,
-              num_attention_heads=num_attention_heads,
-              attention_mask=attention_mask,
-              attention_probs_dropout_prob=attention_probs_dropout_prob,
-              initializer_range=initializer_range,
-              do_return_2d_tensor=True,
-              batch_size=batch_size,
-              from_seq_length=seq_length,
-              to_seq_length=seq_length)
+            from_tensor=layer_input,
+            to_tensor=layer_input,
+            size_per_head=attention_head_size,
+            num_attention_heads=num_attention_heads,
+            attention_mask=attention_mask,
+            attention_probs_dropout_prob=attention_probs_dropout_prob,
+            initializer_range=initializer_range,
+            do_return_2d_tensor=True,
+            batch_size=batch_size,
+            from_seq_length=seq_length,
+            to_seq_length=seq_length,
+            reuse=reuse
+          )
 
         # Run a linear projection of `hidden_size` then add a residual
         # with `layer_input`.
-        with tf.variable_scope('output'):
+        with tf.variable_scope('output', reuse=reuse):
           attention_output = tf.layers.dense(
               attention_output,
               hidden_size,
@@ -327,7 +335,7 @@ def transformer_encoder(input_tensor,
           attention_output = layer_norm(attention_output + layer_input)
 
       # The activation is only applied to the "intermediate" hidden layer.
-      with tf.variable_scope('intermediate'):
+      with tf.variable_scope('intermediate', reuse=reuse):
         intermediate_output = tf.layers.dense(
             attention_output,
             intermediate_size,
@@ -335,7 +343,7 @@ def transformer_encoder(input_tensor,
             kernel_initializer=create_initializer(initializer_range))
 
       # Down-project back to `hidden_size` then add the residual.
-      with tf.variable_scope('output'):
+      with tf.variable_scope('output', reuse=reuse):
         layer_output = tf.layers.dense(
             intermediate_output,
             hidden_size,
@@ -632,16 +640,17 @@ def create_attention_mask_from_input_mask(from_tensor, to_mask):
 
 
 def embedding_postprocessor(input_tensor,
-                            use_token_type=False,
-                            token_type_ids=None,
-                            token_type_vocab_size=16,
-                            token_type_embedding_name='token_type_embeddings',
-                            reuse_token_type=None,
-                            use_position_embeddings=True,
-                            position_embedding_name='position_embeddings',
-                            initializer_range=0.02,
-                            max_position_embeddings=512,
-                            dropout_prob=0.1):
+              use_token_type=False,
+              token_type_ids=None,
+              token_type_vocab_size=16,
+              token_type_embedding_name='token_type_embeddings',
+              reuse_token_type=None,
+              use_position_embeddings=True,
+              reuse_position_embedding=None,
+              position_embedding_name='position_embeddings',
+              initializer_range=0.02,
+              max_position_embeddings=512,
+              dropout_prob=0.1):
   """Performs various post-processing on a word embedding tensor.
 
   Args:
@@ -698,7 +707,8 @@ def embedding_postprocessor(input_tensor,
   if use_position_embeddings:
     assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
     with tf.control_dependencies([assert_op]):
-      full_position_embeddings = tf.get_variable(
+      with tf.variable_scope("position_embedding", reuse=reuse_position_embedding):
+        full_position_embeddings = tf.get_variable(
           name=position_embedding_name,
           shape=[max_position_embeddings, width],
           initializer=create_initializer(initializer_range))
diff --git a/easy_rec/python/layers/sequence_encoder.py b/easy_rec/python/layers/sequence_encoder.py
index 80c90eafa..5286215d4 100644
--- a/easy_rec/python/layers/sequence_encoder.py
+++ b/easy_rec/python/layers/sequence_encoder.py
@@ -75,7 +75,7 @@ def __call__(self, features, group_name, is_training=True, *args, **kwargs):
       encoder_type = encoder.WhichOneof('encoder').lower()
       if encoder_type == 'bst':
         bst = BST(encoder.bst, self._l2_reg, name=group_name)
-        encoding = bst([seq_features, target_feature], is_training)
+        encoding = bst([seq_features, target_feature], is_training, **kwargs)
         outputs.append(encoding)
       elif encoder_type == 'din':
         din = DIN(encoder.din, self._l2_reg, name=group_name)
diff --git a/easy_rec/python/loss/nce_loss.py b/easy_rec/python/loss/nce_loss.py
new file mode 100644
index 000000000..7613384ab
--- /dev/null
+++ b/easy_rec/python/loss/nce_loss.py
@@ -0,0 +1,34 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import tensorflow as tf
+from easy_rec.python.utils.shape_utils import get_shape_list
+
+
+def mask_samples(batch_size):
+    part = tf.ones((batch_size, batch_size), bool)
+    diag_part = tf.linalg.diag_part(part)
+    diag_part = tf.fill(tf.shape(diag_part), False)
+    part = tf.linalg.set_diag(part, diag_part)
+    part_half = tf.concat([part, part], axis=1)
+    part_total = tf.concat([part_half, part_half], axis=0)
+    return part_total
+
+
+def nce_loss(z_i, z_j, temp=1):
+    batch_size = get_shape_list(z_i)[0]
+    N = 2 * batch_size
+    z = tf.concat((z_i, z_j), axis=0)
+    sim = tf.matmul(z, tf.transpose(z)) / temp
+    sim_i_j = tf.matrix_diag_part(tf.slice(sim, [batch_size, 0], [batch_size, batch_size]))
+    sim_j_i = tf.matrix_diag_part(tf.slice(sim, [0, batch_size], [batch_size, batch_size]))
+    positive_samples = tf.reshape(tf.concat((sim_i_j, sim_j_i), axis=0), (N, 1))
+    mask = mask_samples(batch_size)
+    negative_samples = tf.reshape(tf.boolean_mask(sim, mask), (N, -1))
+
+    labels = tf.zeros(N, dtype=tf.int32)
+    logits = tf.concat((positive_samples, negative_samples), axis=1)
+
+    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
+
+    return loss
diff --git a/easy_rec/python/main.py b/easy_rec/python/main.py
index 1c7b82637..d74e8fe6e 100644
--- a/easy_rec/python/main.py
+++ b/easy_rec/python/main.py
@@ -610,6 +610,7 @@ def distribute_evaluate(pipeline_config,
   eval_result_file = os.path.join(model_dir, eval_result_filename)
   logging.info('save eval result to file %s' % eval_result_file)
   if cur_job_name == 'master':
+    print('eval_result = ', eval_result)
     logging.info('eval_result = {0}'.format(eval_result))
     with gfile.GFile(eval_result_file, 'w') as ofile:
       result_to_write = {'eval_method': 'distribute'}
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index e28660c45..e3cdd31ba 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -110,7 +110,7 @@ def get_sequence_encoding(self, group_name=None, is_training=True):
       if group_name in self._sequence_encoding_by_group_name:
         return self._sequence_encoding_by_group_name[group_name]
       encoding = self._sequence_encoder(self._feature_dict, group_name,
-                                        is_training)
+                                        is_training, loss_dict=self._loss_dict)
       self._sequence_encoding_by_group_name[group_name] = encoding
       return encoding
 
@@ -123,7 +123,7 @@ def get_sequence_encoding(self, group_name=None, is_training=True):
         encoding = self._sequence_encoding_by_group_name[group_name]
       else:
         encoding = self._sequence_encoder(self._feature_dict, group_name,
-                                          is_training)
+                                          is_training, loss_dict=self._loss_dict)
         self._sequence_encoding_by_group_name[group_name] = encoding
       if encoding is not None:
         seq_encoding.append(encoding)
diff --git a/easy_rec/python/model/match_model.py b/easy_rec/python/model/match_model.py
index 475ae6def..851c7eb38 100644
--- a/easy_rec/python/model/match_model.py
+++ b/easy_rec/python/model/match_model.py
@@ -174,11 +174,12 @@ def _build_point_wise_loss_graph(self):
     else:
       raise ValueError('invalid loss type: %s' % str(self._loss_type))
 
+    kwargs = {'loss_name': loss_name}
     self._loss_dict[loss_name] = loss_builder.build(
         self._loss_type,
         label=label,
         pred=pred,
-        loss_weight=self._sample_weight)
+        loss_weight=self._sample_weight, **kwargs)
 
     # build kd loss
     kd_loss_dict = loss_builder.build_kd_loss(self.kd, self._prediction_dict,
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index a5917a38d..9d565a745 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -105,6 +105,10 @@ message BSTEncoder {
     required bool use_position_embeddings = 9 [default = true];
     // The stddev of the truncated_normal_initializer for initializing all weight matrices
     required float initializer_range = 10 [default = 0.02];
+    // need contrastive learning
+    required bool need_contrastive_learning = 11 [default = false];
+    // the weight of contrastive learning loss
+    optional float contrastive_loss_weight = 12 [default = 1.0];
 }
 
 message DINEncoder {
diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py
index 57c1d79bd..cbdf95dd2 100644
--- a/easy_rec/python/test/train_eval_test.py
+++ b/easy_rec/python/test/train_eval_test.py
@@ -306,6 +306,11 @@ def test_bst(self):
         'samples/model_config/bst_on_taobao.config', self._test_dir)
     self.assertTrue(self._success)
 
+  def test_bst_contrastive_learning(self):
+    self._success = test_utils.test_single_train_eval(
+        'samples/model_config/bst_cl_on_taobao.config', self._test_dir)
+    self.assertTrue(self._success)
+
   def test_dcn(self):
     self._success = test_utils.test_single_train_eval(
         'samples/model_config/dcn_on_taobao.config', self._test_dir)
diff --git a/samples/model_config/bst_cl_on_taobao.config b/samples/model_config/bst_cl_on_taobao.config
new file mode 100644
index 000000000..77529db5e
--- /dev/null
+++ b/samples/model_config/bst_cl_on_taobao.config
@@ -0,0 +1,304 @@
+train_input_path: "data/test/tb_data/taobao_train_data"
+eval_input_path: "data/test/tb_data/taobao_test_data"
+model_dir: "experiments/dbmtl_taobao_ckpt"
+
+train_config {
+  optimizer_config {
+    adam_optimizer {
+      learning_rate {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 1e-07
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  num_steps: 100
+  sync_replicas: true
+  save_checkpoints_steps: 100
+  log_step_count_steps: 100
+}
+
+eval_config {
+  metrics_set {
+    auc {
+    }
+  }
+}
+
+data_config {
+  batch_size: 4096
+  label_fields: "clk"
+  label_fields: "buy"
+  prefetch_size: 32
+  input_type: CSVInput
+  input_fields {
+    input_name: "clk"
+    input_type: INT32
+  }
+  input_fields {
+    input_name: "buy"
+    input_type: INT32
+  }
+  input_fields {
+    input_name: "pid"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "adgroup_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "cate_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "campaign_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "customer"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "brand"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "user_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "cms_segid"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "cms_group_id"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "final_gender_code"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "age_level"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "pvalue_level"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "shopping_level"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "occupation"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "new_user_class_level"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "tag_category_list"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "tag_brand_list"
+    input_type: STRING
+  }
+  input_fields {
+    input_name: "price"
+    input_type: INT32
+  }
+}
+
+feature_config: {
+  features {
+    input_names: "pid"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "adgroup_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+  }
+  features {
+    input_names: "cate_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    embedding_name: 'category'
+  }
+  features {
+    input_names: "campaign_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+  }
+  features {
+    input_names: "customer"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+  }
+  features {
+    input_names: "brand"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+    embedding_name: 'brand'
+  }
+  features {
+    input_names: "user_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100000
+  }
+  features {
+    input_names: "cms_segid"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features {
+    input_names: "cms_group_id"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features {
+    input_names: "final_gender_code"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "age_level"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "pvalue_level"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "shopping_level"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "occupation"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features {
+    input_names: "new_user_class_level"
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 10
+  }
+  features : {
+    input_names: 'tag_category_list'
+    feature_type: SequenceFeature
+    separator: '|'
+    hash_bucket_size: 10000
+    embedding_dim: 16
+    embedding_name: 'category'
+  }
+  features : {
+    input_names: 'tag_brand_list'
+    feature_type: SequenceFeature
+    separator: '|'
+    hash_bucket_size: 100000
+    embedding_dim: 16
+    embedding_name: 'brand'
+  }
+  features {
+    input_names: "price"
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 50
+  }
+}
+
+model_config {
+  model_class: "DBMTL"
+  feature_groups {
+    group_name: "all"
+    feature_names: "user_id"
+    feature_names: "cms_segid"
+    feature_names: "cms_group_id"
+    feature_names: "age_level"
+    feature_names: "pvalue_level"
+    feature_names: "shopping_level"
+    feature_names: "occupation"
+    feature_names: "new_user_class_level"
+    feature_names: "adgroup_id"
+    feature_names: "cate_id"
+    feature_names: "campaign_id"
+    feature_names: "customer"
+    feature_names: "brand"
+    feature_names: "price"
+    feature_names: "pid"
+    wide_deep: DEEP
+  }
+
+  feature_groups {
+    group_name: "seq"
+    feature_names: "brand"
+    feature_names: "cate_id"
+    feature_names: "tag_category_list"
+    feature_names: "tag_brand_list"
+    sequence_encoders {
+      bst {
+        hidden_size: 256
+        num_attention_heads: 4
+        num_hidden_layers: 1
+        intermediate_size: 512
+        hidden_act: 'gelu'
+        max_position_embeddings: 50
+        hidden_dropout_prob: 0.1
+        attention_probs_dropout_prob: 0
+        need_contrastive_learning: true
+      }
+    }
+    wide_deep: DEEP
+  }
+
+  dbmtl {
+    bottom_dnn {
+      hidden_units: [1024, 512, 256]
+    }
+    task_towers {
+      tower_name: "ctr"
+      label_name: "clk"
+      loss_type: CLASSIFICATION
+      metrics_set: {
+        auc {}
+      }
+      dnn {
+        hidden_units: [256, 128, 64, 32]
+      }
+      relation_dnn {
+        hidden_units: [32]
+      }
+      weight: 1.0
+    }
+    l2_regularization: 1e-6
+    use_sequence_encoder: true
+  }
+  embedding_regularization: 5e-6
+}

From 381c62b1ee5705fd0f8523a10ac7b5c5486cf8fa Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Sun, 23 Apr 2023 16:30:49 +0800
Subject: [PATCH 16/54] [feat]: add more logit

---
 easy_rec/python/input/augment.py              |  25 +-
 easy_rec/python/layers/bst.py                 |  11 +-
 easy_rec/python/layers/dnn.py                 |   8 +-
 easy_rec/python/model/easy_rec_model.py       |  14 +-
 easy_rec/python/protos/easy_rec_model.proto   |   3 +
 easy_rec/python/protos/layer.proto            |   2 +
 easy_rec/python/tools/__init__.py             |   1 +
 easy_rec/python/tools/explainer/__init__.py   |   1 +
 easy_rec/python/tools/explainer/deep_shap.py  | 710 ++++++++++++++++++
 easy_rec/python/tools/explainer/explainer.py  | 506 +++++++++++++
 .../tools/explainer/feature_importance.py     |  50 ++
 easy_rec/python/tools/explainer/methods.py    | 641 ++++++++++++++++
 easy_rec/python/tools/explainer/utils.py      |  69 ++
 easy_rec/python/utils/activation.py           |  75 +-
 easy_rec/python/utils/io_util.py              |   2 +-
 15 files changed, 2044 insertions(+), 74 deletions(-)
 create mode 100644 easy_rec/python/tools/explainer/__init__.py
 create mode 100644 easy_rec/python/tools/explainer/deep_shap.py
 create mode 100644 easy_rec/python/tools/explainer/explainer.py
 create mode 100644 easy_rec/python/tools/explainer/feature_importance.py
 create mode 100644 easy_rec/python/tools/explainer/methods.py
 create mode 100644 easy_rec/python/tools/explainer/utils.py

diff --git a/easy_rec/python/input/augment.py b/easy_rec/python/input/augment.py
index 75298c430..47822c366 100644
--- a/easy_rec/python/input/augment.py
+++ b/easy_rec/python/input/augment.py
@@ -37,11 +37,32 @@ def item_crop(aug_data, length, eta=0.6):
     return cropped_item_seq, num_left
 
 
+def item_reorder(aug_data, length, beta=0.6):
+    length1 = tf.cast(length,dtype=tf.float32)
+    num_reorder = tf.cast(tf.math.floor(length1 * beta) ,dtype=tf.int32)
+    reorder_begin = tf.random.uniform([1], minval=0, maxval=length - num_reorder, dtype=tf.int32)[0]
+    shuffle_index = tf.range(reorder_begin, reorder_begin + num_reorder)
+    shuffle_index = tf.random.shuffle(shuffle_index)
+    x = tf.range(get_shape_list(aug_data)[0])
+    left = tf.slice(x, [0], [reorder_begin])
+    right = tf.slice(x, [reorder_begin + num_reorder], [-1])
+    reordered_item_index = tf.concat([left, shuffle_index, right], axis=0)
+    reordered_item_seq = tf.scatter_nd(tf.expand_dims(reordered_item_index, axis=1),
+                                       aug_data,
+                                       tf.shape(aug_data))
+    return reordered_item_seq, length
+
+
 def augment(x):
     seq, length = x
-    flag = tf.range(2, dtype=tf.int32)
+    flag = tf.range(3, dtype=tf.int32)
     flag1 = tf.random.shuffle(flag)[:1][0]
-    aug_seq, aug_len = tf.cond(tf.equal(flag1, 0), lambda: item_crop(seq, length), lambda: item_mask(seq, length))
+    aug_seq, aug_len = tf.cond(tf.equal(flag1, 0),
+                               lambda: item_crop(seq, length),
+                               lambda: tf.cond(tf.equal(flag1, 1),
+                                               lambda: item_mask(seq, length),
+                                               lambda: item_reorder(seq, length)))
+
     return [aug_seq, aug_len]
 
 
diff --git a/easy_rec/python/layers/bst.py b/easy_rec/python/layers/bst.py
index 466676fd9..c9cf7d8c9 100644
--- a/easy_rec/python/layers/bst.py
+++ b/easy_rec/python/layers/bst.py
@@ -86,10 +86,15 @@ def __call__(self, inputs, training=None, **kwargs):
     if self.config.need_contrastive_learning:
       assert 'loss_dict' in kwargs, "no `loss_dict` in kwargs of bst layer: %s" % self.name
       loss = self.contrastive_loss(seq_input, seq_len, max_position)
-      loss *= self.config.contrastive_loss_weight
+      if self.config.auto_contrastive_loss_weight:
+        uncertainty = tf.Variable(
+          0, name='%s_contrastive_loss_weight' % self.name, dtype=tf.float32)
+        loss = tf.exp(-uncertainty) * loss + 0.5 * uncertainty
+      else:
+        loss *= self.config.contrastive_loss_weight
       loss_dict = kwargs['loss_dict']
-      loss_dict['contrastive_loss'] = loss
-      tf.summary.scalar('loss/%s_contrastive_loss' % self.name, loss)
+      loss_dict['%s_contrastive_loss' % self.name] = loss
+      # tf.summary.scalar('loss/%s_contrastive_loss' % self.name, loss)
 
     if target_feature is not None:
       target_size = target_feature.shape.as_list()[-1]
diff --git a/easy_rec/python/layers/dnn.py b/easy_rec/python/layers/dnn.py
index 74e355e82..d2af5a4cf 100644
--- a/easy_rec/python/layers/dnn.py
+++ b/easy_rec/python/layers/dnn.py
@@ -34,11 +34,7 @@ def __init__(self,
     self._name = name
     self._is_training = is_training
     logging.info('dnn activation function = %s' % self._config.activation)
-    self.activations = [
-        get_activation(
-            self._config.activation, is_training=is_training, feat_dim=units)
-        for units in self.hidden_units
-    ]
+    self.activation = get_activation(self._config.activation, is_training=is_training)
     self._last_layer_no_activation = last_layer_no_activation
     self._last_layer_no_batch_norm = last_layer_no_batch_norm
 
@@ -71,7 +67,7 @@ def __call__(self, deep_fea, hidden_layer_feature_output=False):
             trainable=True,
             name='%s/dnn_%d/bn' % (self._name, i))
       if (i + 1 < hidden_units_len) or not self._last_layer_no_activation:
-        deep_fea = self.activations[i](
+        deep_fea = self.activation(
             deep_fea, name='%s/dnn_%d/act' % (self._name, i))
       if len(self.dropout_ratio) > 0 and self._is_training:
         assert self.dropout_ratio[
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index e3cdd31ba..871306326 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -17,6 +17,7 @@
 from easy_rec.python.utils import estimator_utils
 from easy_rec.python.utils import restore_filter
 from easy_rec.python.utils.load_class import get_register_class_meta
+from easy_rec.python.layers import dnn
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -129,12 +130,21 @@ def get_sequence_encoding(self, group_name=None, is_training=True):
         seq_encoding.append(encoding)
 
     if len(seq_encoding) > 1:
-      return tf.concat(seq_encoding, axis=-1)
+      encoding = tf.concat(seq_encoding, axis=-1)
     elif len(seq_encoding) == 1:
-      return seq_encoding[0]
+      encoding = seq_encoding[0]
     else:
       return None
 
+    if self._base_model_config.HasField('sequence_dnn'):
+      sequence_dnn = dnn.DNN(
+        self._base_model_config.sequence_dnn,
+        self._l2_reg,
+        name='sequence_dnn',
+        is_training=self._is_training)
+      encoding = sequence_dnn(encoding)
+    return encoding
+
   @abstractmethod
   def build_predict_graph(self):
     pass
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 27dcefadc..42f454d95 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -3,6 +3,7 @@ package protos;
 
 import "easy_rec/python/protos/fm.proto";
 import "easy_rec/python/protos/deepfm.proto";
+import "easy_rec/python/protos/dnn.proto";
 import "easy_rec/python/protos/wide_and_deep.proto";
 import "easy_rec/python/protos/multi_tower.proto";
 import "easy_rec/python/protos/dlrm.proto";
@@ -102,4 +103,6 @@ message EasyRecModel {
 
     repeated Loss losses = 15;
 
+    // dnn layers after sequence feature
+    optional DNN sequence_dnn = 16;
 }
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index 9d565a745..e2ca2e217 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -109,6 +109,8 @@ message BSTEncoder {
     required bool need_contrastive_learning = 11 [default = false];
     // the weight of contrastive learning loss
     optional float contrastive_loss_weight = 12 [default = 1.0];
+    // whether need auto learn contrastive loss weight
+    optional bool auto_contrastive_loss_weight = 13 [default = false];
 }
 
 message DINEncoder {
diff --git a/easy_rec/python/tools/__init__.py b/easy_rec/python/tools/__init__.py
index e69de29bb..d8300f4e3 100644
--- a/easy_rec/python/tools/__init__.py
+++ b/easy_rec/python/tools/__init__.py
@@ -0,0 +1 @@
+# from .explainer.explainer import create_explainer
diff --git a/easy_rec/python/tools/explainer/__init__.py b/easy_rec/python/tools/explainer/__init__.py
new file mode 100644
index 000000000..c1917b9fd
--- /dev/null
+++ b/easy_rec/python/tools/explainer/__init__.py
@@ -0,0 +1 @@
+# from .methods import DeepExplain
diff --git a/easy_rec/python/tools/explainer/deep_shap.py b/easy_rec/python/tools/explainer/deep_shap.py
new file mode 100644
index 000000000..4d0b72890
--- /dev/null
+++ b/easy_rec/python/tools/explainer/deep_shap.py
@@ -0,0 +1,710 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import numpy as np
+import warnings
+from tensorflow.python.framework import ops as tf_ops
+from tensorflow.python.ops import gradients_impl as tf_gradients_impl
+
+if not hasattr(tf_gradients_impl, "_IsBackpropagatable"):
+  from tensorflow.python.ops import gradients_util as tf_gradients_impl
+import tensorflow as tf
+
+
+class DeepShap(object):
+  """ Meant to approximate SHAP values for deep learning models.
+
+  This is an enhanced version of the DeepLIFT algorithm (Deep SHAP) where, similar to Kernel SHAP, we
+  approximate the conditional expectations of SHAP values using a selection of background samples.
+  Lundberg and Lee, NIPS 2017 showed that the per node attribution rules in DeepLIFT (Shrikumar,
+  Greenside, and Kundaje, arXiv 2017) can be chosen to approximate Shapley values. By integrating
+  over many backgound samples Deep estimates approximate SHAP values such that they sum
+  up to the difference between the expected model output on the passed background samples and the
+  current model output (f(x) - E[f(x)]).
+  """
+
+  def __init__(self, inputs, output, data, session=None, learning_phase_flags=None):
+    """ An explainer object for a deep model using a given background dataset.
+
+    Note that the complexity of the method scales linearly with the number of background data
+    samples. Passing the entire training dataset as `data` will give very accurate expected
+    values, but be unreasonably expensive. The variance of the expectation estimates scale by
+    roughly 1/sqrt(N) for N background data samples. So 100 samples will give a good estimate,
+    and 1000 samples a very good estimate of the expected values.
+
+    Parameters
+    ----------
+    inputs : [tf.Operation]
+    output : tf.Operation
+        A pair of TensorFlow operations (or a list and an op) that
+        specifies the input and output of the model to be explained. Note that SHAP values
+        are specific to a single output value, so you get an explanation for each element of
+        the output tensor (which must be a flat rank one vector).
+
+    data : [numpy.array] or [pandas.DataFrame] or function
+        The background dataset to use for integrating out features. DeepExplainer integrates
+        over all these samples for each explanation. The data passed here must match the input
+        operations given to the model. If a function is supplied, it must be a function that
+        takes a particular input example and generates the background dataset for that example
+    session : None or tensorflow.Session
+        The TensorFlow session that has the model we are explaining. If None is passed then
+        we do our best to find the right session, first looking for a keras session, then
+        falling back to the default TensorFlow session.
+
+    learning_phase_flags : None or list of tensors
+        If you have your own custom learning phase flags pass them here. When explaining a prediction
+        we need to ensure we are not in training mode, since this changes the behavior of ops like
+        batch norm or dropout. If None is passed then we look for tensors in the graph that look like
+        learning phase flags. Note that we assume all the flags should
+        have a value of False during predictions (and hence explanations).
+
+    """
+    self.model_inputs = inputs
+    self.model_output = output
+    assert type(self.model_output) != list, "The model output to be explained must be a single tensor!"
+    assert len(self.model_output.shape) < 3, "The model output must be a vector or a single value!"
+    self.multi_output = True
+    if len(self.model_output.shape) == 1:
+      self.multi_output = False
+
+    # check if we have multiple inputs
+    self.multi_input = True
+    if type(self.model_inputs) != list or len(self.model_inputs) == 1:
+      self.multi_input = False
+      if type(self.model_inputs) != list:
+        self.model_inputs = [self.model_inputs]
+    if type(data) != list and (hasattr(data, '__call__') == False):
+      data = [data]
+    self.data = data
+
+    self._vinputs = {}  # used to track what op inputs depends on the model inputs
+    self.orig_grads = {}
+
+    if session is None:
+      try:
+        session = tf.compat.v1.keras.backend.get_session()
+      except:
+        session = tf.keras.backend.get_session()
+    self.session = tf.get_default_session() if session is None else session
+    self.graph = self.session.graph
+
+    # if no learning phase flags were given we go looking for them
+    # ...this will catch the one that keras uses
+    # we need to find them since we want to make sure learning phase flags are set to False
+    if learning_phase_flags is None:
+      self.learning_phase_ops = []
+      for op in self.graph.get_operations():
+        if 'learning_phase' in op.name and op.type == "Const" and len(op.outputs[0].shape) == 0:
+          if op.outputs[0].dtype == tf.bool:
+            self.learning_phase_ops.append(op)
+      self.learning_phase_flags = [op.outputs[0] for op in self.learning_phase_ops]
+    else:
+      self.learning_phase_ops = [t.op for t in learning_phase_flags]
+
+    # save the expected output of the model
+    # if self.data is a function, set self.expected_value to None
+    if (hasattr(self.data, '__call__')):
+      self.expected_value = None
+    else:
+      if self.data[0].shape[0] > 5000:
+        warnings.warn(
+          "You have provided over 5k background samples! For better performance consider using smaller random sample.")
+      self.expected_value = self.run(self.model_output, self.model_inputs, self.data).mean(0)
+
+    self._init_between_tensors(self.model_output.op, self.model_inputs)
+
+    # make a blank array that will get lazily filled in with the SHAP value computation
+    # graphs for each output. Lazy is important since if there are 1000 outputs and we
+    # only explain the top 5 it would be a waste to build graphs for the other 995
+    if not self.multi_output:
+      self.phi_symbolics = [None]
+    else:
+      noutputs = self.model_output.shape.as_list()[1]
+      if noutputs is not None:
+        self.phi_symbolics = [None for i in range(noutputs)]
+      else:
+        raise Exception("The model output tensor to be explained cannot have a static shape in dim 1 of None!")
+
+  def run(self, out, model_inputs, X):
+    """ Runs the model while also setting the learning phase flags to False.
+    """
+    feed_dict = dict(zip(model_inputs, X))
+    for t in self.learning_phase_flags:
+      feed_dict[t] = False
+    return self.session.run(out, feed_dict)
+
+  def phi_symbolic(self, i):
+    """ Get the SHAP value computation graph for a given model output.
+        """
+    if self.phi_symbolics[i] is None:
+      def anon():
+        out = self.model_output[:, i] if self.multi_output else self.model_output
+        return tf.gradients(out, self.model_inputs)
+
+      self.phi_symbolics[i] = self.execute_with_overridden_gradients(anon)
+
+    return self.phi_symbolics[i]
+
+  def custom_grad(self, op, *grads):
+    """ Passes a gradient op creation request to the correct handler.
+    """
+    type_name = op.type[5:] if op.type.startswith("shap_") else op.type
+    out = op_handlers[type_name](self, op, *grads)  # we cut off the shap_ prefex before the lookup
+    return out
+
+  def execute_with_overridden_gradients(self, f):
+    # replace the gradients for all the non-linear activations
+    # we do this by hacking our way into the registry (TODO: find a public API for this if it exists)
+    reg = tf_ops._gradient_registry._registry
+    ops_not_in_registry = ['TensorListReserve']
+    # NOTE: location_tag taken from tensorflow source for None type ops
+    location_tag = ("UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN")
+    # TODO: unclear why some ops are not in the registry with TF 2.0 like TensorListReserve
+    for non_reg_ops in ops_not_in_registry:
+      reg[non_reg_ops] = {'type': None, 'location': location_tag}
+    for n in op_handlers:
+      if n in reg:
+        self.orig_grads[n] = reg[n]["type"]
+        reg["shap_" + n] = {
+          "type": self.custom_grad,
+          "location": reg[n]["location"]
+        }
+        reg[n]["type"] = self.custom_grad
+
+    # In TensorFlow 1.10 they started pruning out nodes that they think can't be backpropped
+    # unfortunately that includes the index of embedding layers so we disable that check here
+    if hasattr(tf_gradients_impl, "_IsBackpropagatable"):
+      orig_IsBackpropagatable = tf_gradients_impl._IsBackpropagatable
+      tf_gradients_impl._IsBackpropagatable = lambda tensor: True
+
+    # define the computation graph for the attribution values using a custom gradient-like computation
+    try:
+      out = f()
+    finally:
+      # reinstate the backpropagatable check
+      if hasattr(tf_gradients_impl, "_IsBackpropagatable"):
+        tf_gradients_impl._IsBackpropagatable = orig_IsBackpropagatable
+
+      # restore the original gradient definitions
+      for n in op_handlers:
+        if n in reg:
+          del reg["shap_" + n]
+          reg[n]["type"] = self.orig_grads[n]
+      for non_reg_ops in ops_not_in_registry:
+        del reg[non_reg_ops]
+    return out
+
+  def shap_values(self, X, ranked_outputs=None, output_rank_order="max", check_additivity=True):
+    """ Return approximate SHAP values for the model applied to the data given by X.
+
+    Parameters
+    ----------
+    X : list, numpy.array, or pandas.DataFrame
+        A tensor (or list of tensors) of samples (where X.shape[0] == # samples) on which to
+        explain the model's output.
+
+    ranked_outputs : None or int
+        If ranked_outputs is None then we explain all the outputs in a multi-output model. If
+        ranked_outputs is a positive integer then we only explain that many of the top model
+        outputs (where "top" is determined by output_rank_order). Note that this causes a pair
+        of values to be returned (shap_values, indexes), where shap_values is a list of numpy
+        arrays for each of the output ranks, and indexes is a matrix that indicates for each sample
+        which output indexes were choses as "top".
+
+    output_rank_order : "max", "min", or "max_abs"
+        How to order the model outputs when using ranked_outputs, either by maximum, minimum, or
+        maximum absolute value.
+
+    Returns
+    -------
+    array or list
+        For a models with a single output this returns a tensor of SHAP values with the same shape
+        as X. For a model with multiple outputs this returns a list of SHAP value tensors, each of
+        which are the same shape as X. If ranked_outputs is None then this list of tensors matches
+        the number of model outputs. If ranked_outputs is a positive integer a pair is returned
+        (shap_values, indexes), where shap_values is a list of tensors with a length of
+        ranked_outputs, and indexes is a matrix that indicates for each sample which output indexes
+        were chosen as "top".
+    """
+    # check if we have multiple inputs
+    if not self.multi_input:
+      if type(X) == list and len(X) != 1:
+        assert False, "Expected a single tensor as model input!"
+      elif type(X) != list:
+        X = [X]
+    else:
+      assert type(X) == list, "Expected a list of model inputs!"
+    assert len(self.model_inputs) == len(X), "Number of model inputs (%d) does not match the number given (%d)!" % (
+      len(self.model_inputs), len(X))
+
+    # rank and determine the model outputs that we will explain
+    if ranked_outputs is not None and self.multi_output:
+      model_output_values = self.run(self.model_output, self.model_inputs, X)
+
+      if output_rank_order == "max":
+        model_output_ranks = np.argsort(-model_output_values)
+      elif output_rank_order == "min":
+        model_output_ranks = np.argsort(model_output_values)
+      elif output_rank_order == "max_abs":
+        model_output_ranks = np.argsort(np.abs(model_output_values))
+      else:
+        assert False, "output_rank_order must be max, min, or max_abs!"
+      model_output_ranks = model_output_ranks[:, :ranked_outputs]
+    else:
+      model_output_ranks = np.tile(np.arange(len(self.phi_symbolics)), (X[0].shape[0], 1))
+
+    # compute the attributions
+    output_phis = []
+    for i in range(model_output_ranks.shape[1]):
+      phis = []
+      for k in range(len(X)):
+        phis.append(np.zeros(X[k].shape))
+      for j in range(X[0].shape[0]):
+        if (hasattr(self.data, '__call__')):
+          bg_data = self.data([X[l][j] for l in range(len(X))])
+          if type(bg_data) != list:
+            bg_data = [bg_data]
+        else:
+          bg_data = self.data
+
+        # tile the inputs to line up with the background data samples
+        tiled_X = [np.tile(X[l][j:j + 1], (bg_data[l].shape[0],) + tuple([1 for k in range(len(X[l].shape) - 1)])) for l
+                   in range(len(X))]
+
+        # we use the first sample for the current sample and the rest for the references
+        joint_input = [np.concatenate([tiled_X[l], bg_data[l]], 0) for l in range(len(X))]
+
+        # run attribution computation graph
+        feature_ind = model_output_ranks[j, i]
+        sample_phis = self.run(self.phi_symbolic(feature_ind), self.model_inputs, joint_input)
+
+        # assign the attributions to the right part of the output arrays
+        for l in range(len(X)):
+          phis[l][j] = (sample_phis[l][bg_data[l].shape[0]:] * (X[l][j] - bg_data[l])).mean(0)
+
+      output_phis.append(phis[0] if not self.multi_input else phis)
+
+    # check that the SHAP values sum up to the model output
+    if check_additivity:
+      model_output = self.run(self.model_output, self.model_inputs, X)
+      for l in range(len(self.expected_value)):
+        if not self.multi_input:
+          diffs = model_output[:, l] - self.expected_value[l] - output_phis[l].sum(
+            axis=tuple(range(1, output_phis[l].ndim)))
+        else:
+          diffs = model_output[:, l] - self.expected_value[l]
+          for i in range(len(output_phis[l])):
+            diffs -= output_phis[l][i].sum(axis=tuple(range(1, output_phis[l][i].ndim)))
+        assert np.abs(
+          diffs).max() < 1e-2, "The SHAP explanations do not sum up to the model's output! This is either because of a " \
+                               "rounding error or because an operator in your computation graph was not fully supported. If " \
+                               "the sum difference of %f is significant compared the scale of your model outputs please post " \
+                               "as a github issue, with a reproducible example if possible so we can debug it." % np.abs(
+          diffs).max()
+
+    if not self.multi_output:
+      return output_phis[0]
+    elif ranked_outputs is not None:
+      return output_phis, model_output_ranks
+    else:
+      return output_phis
+
+  def _init_between_tensors(self, out_op, model_inputs):
+    # find all the operations in the graph between our inputs and outputs
+    tensor_blacklist = tensors_blocked_by_false(self.learning_phase_ops)  # don't follow learning phase branches
+    dependence_breakers = [k for k in op_handlers if op_handlers[k] == break_dependence]
+    back_ops = backward_walk_ops(
+      [out_op], tensor_blacklist,
+      dependence_breakers
+    )
+    start_ops = []
+    for minput in model_inputs:
+      for op in minput.consumers():
+        start_ops.append(op)
+    self.between_ops = forward_walk_ops(
+      start_ops,
+      tensor_blacklist, dependence_breakers,
+      within_ops=back_ops
+    )
+
+    # note all the tensors that are on the path between the inputs and the output
+    self.between_tensors = {}
+    for op in self.between_ops:
+      for t in op.outputs:
+        self.between_tensors[t.name] = True
+    for t in model_inputs:
+      self.between_tensors[t.name] = True
+
+    # save what types are being used
+    self.used_types = {}
+    for op in self.between_ops:
+      self.used_types[op.type] = True
+
+  def _variable_inputs(self, op):
+    """ Return which inputs of this operation are variable (i.e. depend on the model inputs).
+    """
+    if op not in self._vinputs:
+      out = np.zeros(len(op.inputs), dtype=np.bool)
+      for i, t in enumerate(op.inputs):
+        out[i] = t.name in self.between_tensors
+      self._vinputs[op] = out
+    return self._vinputs[op]
+
+
+def tensors_blocked_by_false(ops):
+  """ Follows a set of ops assuming their value is False and find blocked Switch paths.
+
+  This is used to prune away parts of the model graph that are only used during the training
+  phase (like dropout, batch norm, etc.).
+  """
+  blocked = []
+
+  def recurse(op):
+    if op.type == "Switch":
+      blocked.append(op.outputs[1])  # the true path is blocked since we assume the ops we trace are False
+    else:
+      for out in op.outputs:
+        for c in out.consumers():
+          recurse(c)
+
+  for op in ops:
+    recurse(op)
+
+  return blocked
+
+
+def backward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist):
+  found_ops = []
+  op_stack = [op for op in start_ops]
+  while len(op_stack) > 0:
+    op = op_stack.pop()
+    if op.type not in op_type_blacklist and op not in found_ops:
+      found_ops.append(op)
+      for input in op.inputs:
+        if input not in tensor_blacklist:
+          op_stack.append(input.op)
+  return found_ops
+
+
+def forward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist, within_ops):
+  found_ops = []
+  op_stack = [op for op in start_ops]
+  while len(op_stack) > 0:
+    op = op_stack.pop()
+    if op.type not in op_type_blacklist and op in within_ops and op not in found_ops:
+      found_ops.append(op)
+      for out in op.outputs:
+        if out not in tensor_blacklist:
+          for c in out.consumers():
+            op_stack.append(c)
+  return found_ops
+
+
+def linearity_1d_nonlinearity_2d(input_ind0, input_ind1, op_func):
+  def handler(explainer, op, *grads):
+    var = explainer._variable_inputs(op)
+    if var[input_ind0] and not var[input_ind1]:
+      return linearity_1d_handler(input_ind0, explainer, op, *grads)
+    elif var[input_ind1] and not var[input_ind0]:
+      return linearity_1d_handler(input_ind1, explainer, op, *grads)
+    elif var[input_ind0] and var[input_ind1]:
+      return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *grads)
+    else:
+      return [None for _ in op.inputs]  # no inputs vary, we must be hidden by a switch function
+
+  return handler
+
+
+def nonlinearity_1d_nonlinearity_2d(input_ind0, input_ind1, op_func):
+  def handler(explainer, op, *grads):
+    var = explainer._variable_inputs(op)
+    if var[input_ind0] and not var[input_ind1]:
+      return nonlinearity_1d_handler(input_ind0, explainer, op, *grads)
+    elif var[input_ind1] and not var[input_ind0]:
+      return nonlinearity_1d_handler(input_ind1, explainer, op, *grads)
+    elif var[input_ind0] and var[input_ind1]:
+      return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *grads)
+    else:
+      return [None for _ in op.inputs]  # no inputs vary, we must be hidden by a switch function
+
+  return handler
+
+
+def nonlinearity_1d(input_ind):
+  def handler(explainer, op, *grads):
+    return nonlinearity_1d_handler(input_ind, explainer, op, *grads)
+
+  return handler
+
+
+def nonlinearity_1d_handler(input_ind, explainer, op, *grads):
+  # make sure only the given input varies
+  op_inputs = op.inputs
+  if op_inputs is None:
+    op_inputs = op.outputs[0].op.inputs
+
+  for i in range(len(op_inputs)):
+    if i != input_ind:
+      assert not explainer._variable_inputs(op)[i], str(i) + "th input to " + op.name + " cannot vary!"
+
+  xin0, rin0 = tf.split(op_inputs[input_ind], 2)
+  xout, rout = tf.split(op.outputs[input_ind], 2)
+  delta_in0 = xin0 - rin0
+  if delta_in0.shape is None:
+    dup0 = [2, 1]
+  else:
+    dup0 = [2] + [1 for i in delta_in0.shape[1:]]
+  out = [None for _ in op_inputs]
+  if op.type.startswith("shap_"):
+    op.type = op.type[5:]
+  orig_grad = explainer.orig_grads[op.type](op, grads[0])
+  out[input_ind] = tf.where(
+    tf.tile(tf.abs(delta_in0), dup0) < 1e-6,
+    orig_grad[input_ind] if len(op_inputs) > 1 else orig_grad,
+    grads[0] * tf.tile((xout - rout) / delta_in0, dup0)
+  )
+  return out
+
+
+def nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *grads):
+  assert input_ind0 == 0 and input_ind1 == 1, "TODO: Can't yet handle double inputs that are not first!"
+  xout, rout = tf.split(op.outputs[0], 2)
+  in0 = op.inputs[input_ind0]
+  in1 = op.inputs[input_ind1]
+  xin0, rin0 = tf.split(in0, 2)
+  xin1, rin1 = tf.split(in1, 2)
+  delta_in0 = xin0 - rin0
+  delta_in1 = xin1 - rin1
+  dup0 = [2] + [1 for i in delta_in0.shape[1:]]
+  out10 = op_func(xin0, rin1)
+  out01 = op_func(rin0, xin1)
+  out11, out00 = xout, rout
+  out0 = 0.5 * (out11 - out01 + out10 - out00)
+  out0 = grads[0] * tf.tile(out0 / delta_in0, dup0)
+  out1 = 0.5 * (out11 - out10 + out01 - out00)
+  out1 = grads[0] * tf.tile(out1 / delta_in1, dup0)
+
+  # Avoid divide by zero nans
+  out0 = tf.where(tf.abs(tf.tile(delta_in0, dup0)) < 1e-7, tf.zeros_like(out0), out0)
+  out1 = tf.where(tf.abs(tf.tile(delta_in1, dup0)) < 1e-7, tf.zeros_like(out1), out1)
+
+  # see if due to broadcasting our gradient shapes don't match our input shapes
+  if (np.any(np.array(out1.shape) != np.array(in1.shape))):
+    broadcast_index = np.where(np.array(out1.shape) != np.array(in1.shape))[0][0]
+    out1 = tf.reduce_sum(out1, axis=broadcast_index, keepdims=True)
+  elif (np.any(np.array(out0.shape) != np.array(in0.shape))):
+    broadcast_index = np.where(np.array(out0.shape) != np.array(in0.shape))[0][0]
+    out0 = tf.reduce_sum(out0, axis=broadcast_index, keepdims=True)
+
+  return [out0, out1]
+
+
+def softmax(explainer, op, *grads):
+  """ Just decompose softmax into its components and recurse, we can handle all of them :)
+
+    We assume the 'axis' is the last dimension because the TF codebase swaps the 'axis' to
+    the last dimension before the softmax op if 'axis' is not already the last dimension.
+    We also don't subtract the max before tf.exp for numerical stability since that might
+    mess up the attributions and it seems like TensorFlow doesn't define softmax that way
+    (according to the docs)
+    """
+  in0 = op.inputs[0]
+  in0_max = tf.reduce_max(in0, axis=-1, keepdims=True, name="in0_max")
+  in0_centered = in0 - in0_max
+  evals = tf.exp(in0_centered, name="custom_exp")
+  rsum = tf.reduce_sum(evals, axis=-1, keepdims=True)
+  div = evals / rsum
+
+  # mark these as in-between the inputs and outputs
+  for op in [evals.op, rsum.op, div.op, in0_centered.op]:
+    for t in op.outputs:
+      if t.name not in explainer.between_tensors:
+        explainer.between_tensors[t.name] = False
+
+  out = tf.gradients(div, in0_centered, grad_ys=grads[0])[0]
+
+  # remove the names we just added
+  for op in [evals.op, rsum.op, div.op, in0_centered.op]:
+    for t in op.outputs:
+      if explainer.between_tensors[t.name] is False:
+        del explainer.between_tensors[t.name]
+
+  # rescale to account for our shift by in0_max (which we did for numerical stability)
+  xin0, rin0 = tf.split(in0, 2)
+  xin0_centered, rin0_centered = tf.split(in0_centered, 2)
+  delta_in0 = xin0 - rin0
+  dup0 = [2] + [1 for i in delta_in0.shape[1:]]
+  return tf.where(
+    tf.tile(tf.abs(delta_in0), dup0) < 1e-6,
+    out,
+    out * tf.tile((xin0_centered - rin0_centered) / delta_in0, dup0)
+  )
+
+
+def maxpool(explainer, op, *grads):
+  xin0, rin0 = tf.split(op.inputs[0], 2)
+  xout, rout = tf.split(op.outputs[0], 2)
+  delta_in0 = xin0 - rin0
+  dup0 = [2] + [1 for i in delta_in0.shape[1:]]
+  cross_max = tf.maximum(xout, rout)
+  diffs = tf.concat([cross_max - rout, xout - cross_max], 0)
+  if op.type.startswith("shap_"):
+    op.type = op.type[5:]
+  xmax_pos, rmax_pos = tf.split(explainer.orig_grads[op.type](op, grads[0] * diffs), 2)
+  return tf.tile(tf.where(
+    tf.abs(delta_in0) < 1e-7,
+    tf.zeros_like(delta_in0),
+    (xmax_pos + rmax_pos) / delta_in0
+  ), dup0)
+
+
+def gather(explainer, op, *grads):
+  # params = op.inputs[0]
+  indices = op.inputs[1]
+  # axis = op.inputs[2]
+  var = explainer._variable_inputs(op)
+  if var[1] and not var[0]:
+    assert len(indices.shape) == 2, "Only scalar indices supported right now in GatherV2!"
+
+    xin1, rin1 = tf.split(tf.cast(op.inputs[1], tf.float32), 2)
+    xout, rout = tf.split(op.outputs[0], 2)
+    dup_in1 = [2] + [1 for i in xin1.shape[1:]]
+    dup_out = [2] + [1 for i in xout.shape[1:]]
+    delta_in1_t = tf.tile(xin1 - rin1, dup_in1)
+    out_sum = tf.reduce_sum(grads[0] * tf.tile(xout - rout, dup_out),
+                            list(range(len(indices.shape), len(grads[0].shape))))
+    if op.type == "ResourceGather":
+      return [None, tf.where(
+        tf.abs(delta_in1_t) < 1e-6,
+        tf.zeros_like(delta_in1_t),
+        out_sum / delta_in1_t
+      )]
+    return [None, tf.where(
+      tf.abs(delta_in1_t) < 1e-6,
+      tf.zeros_like(delta_in1_t),
+      out_sum / delta_in1_t
+    ), None]
+  elif var[0] and not var[1]:
+    if op.type.startswith("shap_"):
+      op.type = op.type[5:]
+    return [explainer.orig_grads[op.type](op, grads[0]), None]  # linear in this case
+  else:
+    assert False, "Axis not yet supported to be varying for gather op!"
+
+
+def linearity_1d(input_ind):
+  def handler(explainer, op, *grads):
+    return linearity_1d_handler(input_ind, explainer, op, *grads)
+
+  return handler
+
+
+def linearity_1d_handler(input_ind, explainer, op, *grads):
+  # make sure only the given input varies (negative means only that input cannot vary, and is measured from the end of the list)
+  for i in range(len(op.inputs)):
+    if i != input_ind:
+      assert not explainer._variable_inputs(op)[i], str(i) + "th input to " + op.name + " cannot vary!"
+  if op.type.startswith("shap_"):
+    op.type = op.type[5:]
+  return explainer.orig_grads[op.type](op, *grads)
+
+
+def linearity_with_excluded(input_inds):
+  def handler(explainer, op, *grads):
+    return linearity_with_excluded_handler(input_inds, explainer, op, *grads)
+
+  return handler
+
+
+def linearity_with_excluded_handler(input_inds, explainer, op, *grads):
+  # make sure the given inputs don't vary (negative is measured from the end of the list)
+  for i in range(len(op.inputs)):
+    if i in input_inds or i - len(op.inputs) in input_inds:
+      assert not explainer._variable_inputs(op)[i], str(i) + "th input to " + op.name + " cannot vary!"
+  if op.type.startswith("shap_"):
+    op.type = op.type[5:]
+  return explainer.orig_grads[op.type](op, *grads)
+
+
+def passthrough(explainer, op, *grads):
+  if op.type.startswith("shap_"):
+    op.type = op.type[5:]
+  return explainer.orig_grads[op.type](op, *grads)
+
+
+def break_dependence(explainer, op, *grads):
+  """ This function name is used to break attribution dependence in the graph traversal.
+
+  These operation types may be connected above input data values in the graph but their outputs
+  don't depend on the input values (for example they just depend on the shape).
+  """
+  return [None for _ in op.inputs]
+
+
+op_handlers = {}
+
+# ops that are always linear
+op_handlers["Identity"] = passthrough
+op_handlers["StridedSlice"] = passthrough
+op_handlers["Squeeze"] = passthrough
+op_handlers["ExpandDims"] = passthrough
+op_handlers["Pack"] = passthrough
+op_handlers["BiasAdd"] = passthrough
+op_handlers["Unpack"] = passthrough
+op_handlers["Add"] = passthrough
+op_handlers["Sub"] = passthrough
+op_handlers["Merge"] = passthrough
+op_handlers["Sum"] = passthrough
+op_handlers["Mean"] = passthrough
+op_handlers["Cast"] = passthrough
+op_handlers["Transpose"] = passthrough
+op_handlers["Enter"] = passthrough
+op_handlers["Exit"] = passthrough
+op_handlers["NextIteration"] = passthrough
+op_handlers["Tile"] = passthrough
+op_handlers["TensorArrayScatterV3"] = passthrough
+op_handlers["TensorArrayReadV3"] = passthrough
+op_handlers["TensorArrayWriteV3"] = passthrough
+
+# ops that don't pass any attributions to their inputs
+op_handlers["Shape"] = break_dependence
+op_handlers["RandomUniform"] = break_dependence
+op_handlers["ZerosLike"] = break_dependence
+# op_handlers["StopGradient"] = break_dependence # this allows us to stop attributions when we want to (like softmax re-centering)
+
+# ops that are linear and only allow a single input to vary
+op_handlers["Reshape"] = linearity_1d(0)
+op_handlers["Pad"] = linearity_1d(0)
+op_handlers["ReverseV2"] = linearity_1d(0)
+op_handlers["ConcatV2"] = linearity_with_excluded([-1])
+op_handlers["Conv2D"] = linearity_1d(0)
+op_handlers["Switch"] = linearity_1d(0)
+op_handlers["AvgPool"] = linearity_1d(0)
+op_handlers["FusedBatchNorm"] = linearity_1d(0)
+
+# ops that are nonlinear and only allow a single input to vary
+op_handlers["Relu"] = nonlinearity_1d(0)
+op_handlers["Elu"] = nonlinearity_1d(0)
+op_handlers["Sigmoid"] = nonlinearity_1d(0)
+op_handlers["Tanh"] = nonlinearity_1d(0)
+op_handlers["Softplus"] = nonlinearity_1d(0)
+op_handlers["Exp"] = nonlinearity_1d(0)
+op_handlers["ClipByValue"] = nonlinearity_1d(0)
+op_handlers["Rsqrt"] = nonlinearity_1d(0)
+op_handlers["Square"] = nonlinearity_1d(0)
+op_handlers["Max"] = nonlinearity_1d(0)
+
+# ops that are nonlinear and allow two inputs to vary
+op_handlers["SquaredDifference"] = nonlinearity_1d_nonlinearity_2d(0, 1, lambda x, y: (x - y) * (x - y))
+op_handlers["Minimum"] = nonlinearity_1d_nonlinearity_2d(0, 1, lambda x, y: tf.minimum(x, y))
+op_handlers["Maximum"] = nonlinearity_1d_nonlinearity_2d(0, 1, lambda x, y: tf.maximum(x, y))
+
+# ops that allow up to two inputs to vary are are linear when only one input varies
+op_handlers["Mul"] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: x * y)
+op_handlers["RealDiv"] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: x / y)
+op_handlers["MatMul"] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: tf.matmul(x, y))
+
+# ops that need their own custom attribution functions
+op_handlers["GatherV2"] = gather
+op_handlers["ResourceGather"] = gather
+op_handlers["MaxPool"] = maxpool
+op_handlers["Softmax"] = softmax
diff --git a/easy_rec/python/tools/explainer/explainer.py b/easy_rec/python/tools/explainer/explainer.py
new file mode 100644
index 000000000..a40784458
--- /dev/null
+++ b/easy_rec/python/tools/explainer/explainer.py
@@ -0,0 +1,506 @@
+import tensorflow as tf
+from tensorflow.python.platform import gfile
+from tensorflow.python.saved_model import signature_constants
+from easy_rec.python.utils.load_class import get_register_class_meta
+from easy_rec.python.utils.config_util import get_configs_from_pipeline_file
+from easy_rec.python.utils.input_utils import get_type_defaults
+from easy_rec.python.tools.explainer.methods import DeepExplain
+# from easy_rec.python.tools.explainer.deep_shap import DeepShap
+from easy_rec.python.protos.dataset_pb2 import DatasetConfig
+import abc
+import collections
+import numpy as np
+import logging
+import six
+import time
+from six import moves
+import os
+
+_EXPLAINER_CLASS_MAP = {}
+_register_abc_meta = get_register_class_meta(
+  _EXPLAINER_CLASS_MAP, have_abstract_class=True)
+
+
+class Explainer(six.with_metaclass(_register_abc_meta, object)):
+  version = 1
+
+  def __init__(self, deep_explain, model_path, method_name):
+    """Base class for explainer.
+
+    Args:
+      deep_explain: a deep explain context manager
+      model_path:  saved_model directory or frozen pb file path
+      method_name: explain method name
+    """
+    self.deep_explain = deep_explain
+    self.method = method_name
+    self._inputs_map = collections.OrderedDict()
+    self._outputs_map = collections.OrderedDict()
+    self._model_path = model_path
+    self._explainer = None
+    self._effective_fields = None
+    self._build_model()
+
+  def _build_model(self):
+    model_path = self._model_path
+    logging.info('loading model from %s' % model_path)
+    if gfile.IsDirectory(model_path):
+      assert tf.saved_model.loader.maybe_saved_model_directory(model_path), \
+        'saved model does not exists in %s' % model_path
+    else:
+      raise ValueError('currently only savedmodel is supported, path:' + model_path)
+
+    input_fields = _get_input_fields_from_pipeline_config(model_path)
+    self._input_fields_info, self._input_fields = input_fields
+
+    de = self.deep_explain
+    meta_graph_def = tf.saved_model.loader.load(
+      de.session, [tf.saved_model.tag_constants.SERVING], model_path)
+    # parse signature
+    signature_def = meta_graph_def.signature_def[
+      signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+    inputs = signature_def.inputs
+    input_info = []
+    self._is_multi_placeholder = len(inputs.items()) > 1
+    if self._is_multi_placeholder:
+      for gid, item in enumerate(inputs.items()):
+        name, tensor = item
+        logging.info('Load input binding: %s -> %s' % (name, tensor.name))
+        input_name = tensor.name
+        input_name, _ = input_name.split(':')
+        try:
+          input_id = input_name.split('_')[-1]
+          input_id = int(input_id)
+        except Exception:
+          # support for models that are not exported by easy_rec
+          # in which case, the order of inputs may not be the
+          # same as they are defined, therefore, list input
+          # could not be supported, only dict input could be supported
+          logging.warning(
+            'could not determine input_id from input_name: %s' % input_name)
+          input_id = gid
+        input_info.append((input_id, name, tensor.dtype))
+        self._inputs_map[name] = de.graph.get_tensor_by_name(tensor.name)
+    else:
+      # only one input, all features concatenate together
+      for name, tensor in inputs.items():
+        logging.info('Load input binding: %s -> %s' % (name, tensor.name))
+        input_info.append((0, name, tensor.dtype))
+        self._inputs_map[name] = de.graph.get_tensor_by_name(tensor.name)
+
+    # sort inputs by input_ids so as to match the order of csv data
+    input_info.sort(key=lambda t: t[0])
+    self._input_names = [t[1] for t in input_info]
+
+    outputs = signature_def.outputs
+    for name, tensor in outputs.items():
+      logging.info('Load output binding: %s -> %s' % (name, tensor.name))
+      self._outputs_map[name] = de.graph.get_tensor_by_name(tensor.name)
+
+    # get assets
+    # self._assets = {}
+    # asset_files = tf.get_collection(constants.ASSETS_KEY)
+    # for any_proto in asset_files:
+    #   asset_file = meta_graph_pb2.AssetFileDef()
+    #   any_proto.Unpack(asset_file)
+    #   type_name = asset_file.tensor_info.name.split(':')[0]
+    #   asset_path = os.path.join(model_path, constants.ASSETS_DIRECTORY,
+    #                             asset_file.filename)
+    #   assert gfile.Exists(
+    #     asset_path), '%s is missing in saved model' % asset_path
+    #   self._assets[type_name] = asset_path
+    # logging.info(self._assets)
+
+  def default_values(self):
+    input_fields = self._input_fields if self._effective_fields is None else self._effective_fields
+    n = len(input_fields)
+    m = len(self._input_names)
+    assert m == n, 'the number input columns is not expected, %d given, %d expected\n' \
+                   'model inputs: %s\ninput fields: %s' % (n, m, ','.join(self._input_names), ','.join(input_fields))
+
+    default_value = []
+    for i, (field, name) in enumerate(zip(input_fields, self._input_names)):
+      assert field == name, "input field `%d` has different names: <%s, %s>" % (i, field, name)
+      value = self._get_defaults(field)
+      # default_value.append(np.array([value]))  # for deep_shap
+      default_value.append(np.array(value))  # for deep_shap
+    return default_value
+
+  def _get_defaults(self, col_name, col_type='string'):
+    if col_name in self._input_fields_info:
+      col_type, default_val = self._input_fields_info[col_name]
+      default_val = get_type_defaults(col_type, default_val)
+      logging.info('col_name: %s, default_val: %s' % (col_name, default_val))
+    else:
+      defaults = {'string': '', 'double': 0.0, 'bigint': 0}
+      assert col_type in defaults, 'invalid col_type: %s, col_type: %s' % (
+        col_name, col_type)
+      default_val = defaults[col_type]
+      logging.info(
+        'col_name: %s, default_val: %s.[not defined in saved_model_dir/assets/pipeline.config]'
+        % (col_name, default_val))
+    return default_val
+
+  def str_to_number(self, values):
+    assert len(values) == len(self._input_fields), "value count %d is not equal to the number of input fields %d" % (
+      len(values), len(self._input_fields)
+    )
+    result = []
+    for i, name in enumerate(self._input_names):
+      assert name in self._input_fields_info, "input `%s` not in pipeline config" % name
+      idx = self._input_fields.index(name)
+      input_type, default_val = self._input_fields_info[name]
+      if input_type in {DatasetConfig.INT32, DatasetConfig.INT64}:
+        tmp_field = int(values[idx])
+      elif input_type in [DatasetConfig.FLOAT, DatasetConfig.DOUBLE]:
+        tmp_field = float(values[idx])
+      elif input_type in [DatasetConfig.BOOL]:
+        tmp_field = values[idx].lower() in ['true', '1', 't', 'y', 'yes']
+      elif input_type in [DatasetConfig.STRING]:
+        tmp_field = values[idx]
+      else:
+        assert False, 'invalid types: %s' % str(input_type)
+      result.append(tmp_field)
+    return result
+
+  def get_explainer(self, output_cols=None):
+    if output_cols is None or output_cols == 'ALL_COLUMNS':
+      self._output_cols = sorted(self.output_names)
+      logging.info('predict output cols: %s' % self._output_cols)
+    else:
+      # specified as score float,embedding string
+      tmp_cols = []
+      for x in output_cols.split(','):
+        if x.strip() == '':
+          continue
+        tmp_keys = x.split(' ')
+        tmp_cols.append(tmp_keys[0].strip())
+      self._output_cols = tmp_cols
+    if len(self._output_cols) > 1:
+      logging.warning('Only one output can be supported currently, use the first one: %s', self._output_cols[0])
+
+    output_name = self._output_cols[0]
+    assert output_name in self.output_names, 'invalid output name `%s` not in model outputs `%s`' % (
+      output_name, ','.join(self.output_names))
+    if output_name is None:
+      output = self._outputs_map.values()[0]
+    elif type(output_name) in {str, unicode}:
+      output = self._outputs_map[output_name]
+    else:
+      raise Exception('unsupported type of output_name: ' + str(type(output_name)))
+
+    def_vals = self.default_values()
+    # print('default values (%d):' % len(def_vals), def_vals)
+    inputs = [self._inputs_map[name] for name in self._input_names]
+    # e = DeepShap(inputs, output, def_vals, session=self._session)
+    # self._explainer = e
+    e = self.deep_explain.get_explainer(self.method, output, inputs, baseline=def_vals)
+    return e
+
+  @property
+  def input_names(self):
+    """Input names of the model.
+
+    Returns:
+      a list, which conaining the name of input nodes available in model
+    """
+    return self._input_names
+
+  @property
+  def output_names(self):
+    """Output names of the model.
+
+    Returns:
+      a list, which containing the name of outputs nodes available in model
+    """
+    return list(self._outputs_map.keys())
+
+  @abc.abstractmethod
+  def feature_importance(self,
+                         input_path,
+                         output_path,
+                         reserved_cols='',
+                         output_cols=None,
+                         batch_size=1024,
+                         slice_id=0,
+                         slice_num=1):
+    pass
+
+  # def create_output_table(self, reserved_cols=''):
+  #   reserved_cols = [x.strip() for x in reserved_cols.split(',') if x != '']
+  #   outputs = self.input_names
+  #   reserved_cols = filter(lambda r: r not in outputs, reserved_cols)
+  #   output_cols = reserved_cols + outputs
+  #   sql = 'create table output_table '
+  #   return sql
+
+
+class OdpsExplainer(Explainer):
+  def feature_importance(self,
+                         input_path,
+                         output_path,
+                         reserved_cols='',
+                         output_cols=None,
+                         batch_size=1024,
+                         slice_id=0,
+                         slice_num=1):
+    input_cols = self.input_names
+    input_dim = len(input_cols)
+    if reserved_cols:
+      reserved_cols = [x.strip() for x in reserved_cols.split(',') if x.strip() not in input_cols]
+      input_cols.extend(reserved_cols)
+    selected_cols = ','.join(input_cols)
+    print("selected_cols: " + selected_cols)
+
+    explainer = self.get_explainer(output_cols)
+    print("reference value:", explainer.expected_value)
+
+    import common_io
+    reader = common_io.table.TableReader(input_path, selected_cols=selected_cols,
+                                         slice_id=slice_id, slice_count=slice_num)
+
+    reserved_cols_idx = []
+    if reserved_cols:
+      reserved_cols = [x.strip() for x in reserved_cols.split(',') if x != '']
+      schema = reader.get_schema()
+      columns = [str(x[0]) for x in schema]
+      reserved_cols_idx = [columns.index(x) for x in reserved_cols]
+      print(reserved_cols_idx)
+
+    sum_t0, sum_t1, sum_t2 = 0, 0, 0
+    writer = common_io.table.TableWriter(output_path, slice_id=slice_id)
+    total_records_num = reader.get_row_count()
+    for i in moves.range(0, total_records_num, batch_size):
+      t0 = time.time()
+      records = reader.read(batch_size, allow_smaller_final_batch=True)
+      t1 = time.time()
+      records = np.array(records)
+      inputs = list(records[:, :input_dim].T)
+      sv = explainer.shap_values(inputs, check_additivity=False)
+      outputs = [records[:, i] for i in reserved_cols_idx]
+      if outputs:
+        outputs.extend(sv[0])
+      else:
+        outputs = sv[0]
+      indices = range(len(outputs))
+      t2 = time.time()
+      writer.write(np.array(outputs).T, indices, allow_type_cast=True)
+      t3 = time.time()
+      sum_t0 += (t1 - t0)
+      sum_t1 += (t2 - t1)
+      sum_t2 += (t3 - t2)
+      if i % 100 == 0:
+        logging.info('progress: batch_num=%d sample_num=%d' %
+                     (i + 1, (i + 1) * batch_size))
+        logging.info('time_stats: read: %.2f predict: %.2f write: %.2f' %
+                     (sum_t0, sum_t1, sum_t2))
+      logging.info('Final_time_stats: read: %.2f predict: %.2f write: %.2f' %
+                   (sum_t0, sum_t1, sum_t2))
+    writer.close()
+    reader.close()
+    logging.info('Explain %s done.' % input_path)
+
+
+class OdpsRtpExplainer(Explainer):
+  def __init__(self, deep_explain, model_path, method_name):
+    super(OdpsRtpExplainer, self).__init__(deep_explain, model_path, method_name)
+    pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
+    if not gfile.Exists(pipeline_path):
+      logging.warning(
+        '%s not exists, default values maybe inconsistent with the values used in training.'
+        % pipeline_path)
+      return
+    pipeline_config = get_configs_from_pipeline_file(pipeline_path)
+    self._fg_separator = pipeline_config.data_config.separator
+
+    if pipeline_config.export_config.filter_inputs:
+      if len(pipeline_config.feature_configs) > 0:
+        feature_configs = pipeline_config.feature_configs
+      elif pipeline_config.feature_config and len(
+          pipeline_config.feature_config.features) > 0:
+        feature_configs = pipeline_config.feature_config.features
+      else:
+        assert False, 'One of feature_configs and feature_config.features must be configured.'
+
+      self._effective_fields = []
+      for fc in feature_configs:
+        for input_name in fc.input_names:
+          assert input_name in self._input_fields, 'invalid input_name in %s' % str(fc)
+          if input_name not in self._effective_fields:
+            self._effective_fields.append(input_name)
+      self._effective_fids = [
+        self._input_fields.index(x) for x in self._effective_fields
+      ]
+      # sort fids from small to large
+      self._effective_fids = list(set(self._effective_fids))
+      self._effective_fields = [
+        self._input_fields[x] for x in self._effective_fids
+      ]
+      logging.info(
+        "raw input fields: %d, effective fields: %d" % (len(self._input_fields), len(self._effective_fields)))
+
+  def feature_importance(self,
+                         input_path,
+                         output_path,
+                         reserved_cols='',
+                         output_cols=None,
+                         batch_size=1024,
+                         slice_id=0,
+                         slice_num=1):
+    input_cols = [x.strip() for x in reserved_cols.split(',') if x != '']
+    reserved_dim = len(input_cols)
+    if 'features' not in input_cols:
+      input_cols.append('features')
+    selected_cols = ','.join(input_cols)
+    print("selected_cols: " + selected_cols)
+
+    explainer = self.get_explainer(output_cols)
+    print("reference value:", explainer.expected_value)
+
+    import common_io
+    reader = common_io.table.TableReader(input_path, selected_cols=selected_cols,
+                                         slice_id=slice_id, slice_count=slice_num)
+
+    sum_t0, sum_t1, sum_t2 = 0, 0, 0
+    writer = common_io.table.TableWriter(output_path, slice_id=slice_id)
+    total_records_num = reader.get_row_count()
+    for i in moves.range(0, total_records_num, batch_size):
+      t0 = time.time()
+      records = reader.read(batch_size, allow_smaller_final_batch=True)
+      t1 = time.time()
+      inputs = []
+      reserved = []
+      for j in range(len(records)):
+        if reserved_dim > 0:
+          reserved.append(records[j][:reserved_dim])
+        inputs.append(self.str_to_number(records[j][-1].decode('utf-8').split(self._fg_separator)))
+      inputs = list(np.array(inputs).T)
+      print("inputs:", inputs)
+      # sv = explainer.shap_values(inputs, check_additivity=False)
+      ret = explainer.run(inputs, batch_size=len(records))
+      ret = np.array(ret)
+      if reserved_dim > 0:
+        outputs = np.concatenate([np.array(reserved), ret], axis=1)
+      else:
+        outputs = ret
+      indices = range(outputs.shape[1])
+      t2 = time.time()
+      writer.write(outputs.T, indices, allow_type_cast=True)
+      t3 = time.time()
+      sum_t0 += (t1 - t0)
+      sum_t1 += (t2 - t1)
+      sum_t2 += (t3 - t2)
+      if i % 2 == 0:
+        logging.info('progress: batch_num=%d sample_num=%d' %
+                     (i + 1, (i + 1) * batch_size))
+        logging.info('time_stats: read: %.2f predict: %.2f write: %.2f' %
+                     (sum_t0, sum_t1, sum_t2))
+      logging.info('Final_time_stats: read: %.2f predict: %.2f write: %.2f' %
+                   (sum_t0, sum_t1, sum_t2))
+    writer.close()
+    reader.close()
+    logging.info('Explain %s done.' % input_path)
+
+
+def _get_input_fields_from_pipeline_config(model_path):
+  pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
+  if not gfile.Exists(pipeline_path):
+    logging.warning(
+      '%s not exists, default values maybe inconsistent with the values used in training.'
+      % pipeline_path)
+    return {}, []
+  pipeline_config = get_configs_from_pipeline_file(pipeline_path)
+  data_config = pipeline_config.data_config
+  label_fields = data_config.label_fields
+  labels = {x for x in label_fields}
+  if data_config.HasField('sample_weight'):
+    labels.add(data_config.sample_weight)
+
+  input_fields = data_config.input_fields
+  input_fields_info = {
+    input_field.input_name:
+      (input_field.input_type, input_field.default_val)
+    for input_field in input_fields if input_field.input_name not in labels
+  }
+  input_fields_list = [input_field.input_name for input_field in input_fields if input_field.input_name not in labels]
+  return input_fields_info, input_fields_list
+
+
+def search_pb(directory, use_latest=False):
+  """Search pb file recursively in model directory. if multiple pb files exist, exception will be raised.
+
+  If multiple pb files exist, exception will be raised.
+
+  Args:
+    directory: model directory.
+
+  Returns:
+    directory contain pb file
+  """
+  dir_list = []
+  for root, dirs, files in gfile.Walk(directory):
+    for f in files:
+      if f.endswith('saved_model.pb'):
+        dir_list.append(root)
+  if len(dir_list) == 0:
+    raise ValueError('savedmodel is not found in directory %s' % directory)
+  elif len(dir_list) > 1:
+    if use_latest:
+      logging.info('find %d models: %s' % (len(dir_list), ','.join(dir_list)))
+      dir_list = sorted(
+        dir_list,
+        key=lambda x: int(x.split('/')[(-2 if (x[-1] == '/') else -1)]))
+      return dir_list[-1]
+    else:
+      raise ValueError('multiple saved model found in directory %s' %
+                       directory)
+
+  return dir_list[0]
+
+
+# def create_explainer(model_path, use_latest=False):
+#   if gfile.IsDirectory(model_path):
+#     model_path = search_pb(model_path, use_latest)
+#   else:
+#     raise ValueError('model_path should be a directory, path:' + model_path)
+#   pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
+#   if not gfile.Exists(pipeline_path):
+#     logging.warning('%s not exists' % pipeline_path)
+#     raise ValueError('%s not exists' % pipeline_path)
+#
+#   pipeline_config = get_configs_from_pipeline_file(pipeline_path)
+#   input_type = pipeline_config.data_config.input_type
+#   if input_type in {DatasetConfig.OdpsInput, DatasetConfig.OdpsInputV2, DatasetConfig.OdpsInputV3}:
+#     return OdpsExplainer(model_path)
+#   if input_type in {DatasetConfig.OdpsRTPInput, DatasetConfig.OdpsRTPInputV2}:
+#     return OdpsRtpExplainer(model_path)
+#   raise ValueError("currently unsupported input type: " + input_type)
+
+
+def run(FLAGS):
+  model_path = FLAGS.saved_model_dir
+  if gfile.IsDirectory(model_path):
+    model_path = search_pb(model_path, False)
+  else:
+    raise ValueError('model_path should be a directory, path:' + model_path)
+  pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
+  if not gfile.Exists(pipeline_path):
+    logging.warning('%s not exists' % pipeline_path)
+    raise ValueError('%s not exists' % pipeline_path)
+
+  gpu_options = tf.GPUOptions(allow_growth=True)
+  session_config = tf.ConfigProto(
+    gpu_options=gpu_options,
+    allow_soft_placement=True)
+  session = tf.Session(config=session_config)
+
+  worker_count = len(FLAGS.worker_hosts.split(','))
+  with DeepExplain(session=session) as de:
+    e = OdpsRtpExplainer(de, model_path, 'deeplift')
+    e.feature_importance(FLAGS.explain_tables if FLAGS.explain_tables else FLAGS.tables,
+                         FLAGS.outputs,
+                         reserved_cols=FLAGS.reserved_cols,
+                         output_cols=FLAGS.output_cols,
+                         batch_size=FLAGS.batch_size,
+                         slice_id=FLAGS.task_index,
+                         slice_num=worker_count)
diff --git a/easy_rec/python/tools/explainer/feature_importance.py b/easy_rec/python/tools/explainer/feature_importance.py
new file mode 100644
index 000000000..034f3c0da
--- /dev/null
+++ b/easy_rec/python/tools/explainer/feature_importance.py
@@ -0,0 +1,50 @@
+from __future__ import print_function
+from easy_rec.python.tools.explainer.explainer import run
+import tensorflow as tf
+flags = tf.app.flags
+
+flags.DEFINE_string('saved_model_dir', '', 'directory where saved_model.pb exists')
+flags.DEFINE_string('explain_tables', '', 'tables used for explaination')
+flags.DEFINE_string('background_table', '', 'tables used for expected value')
+flags.DEFINE_string('tables', '', 'tables passed by pai command')
+flags.DEFINE_string('outputs', '', 'output tables')
+flags.DEFINE_string(
+    'selected_cols', '',
+    'columns to keep from input table,  they are separated with ,')
+flags.DEFINE_string(
+    'reserved_cols', '',
+    'columns to keep from input table,  they are separated with ,')
+flags.DEFINE_string(
+    'output_cols', None,
+    'output columns, such as: score float. multiple columns are separated by ,')
+flags.DEFINE_integer('batch_size', 1024, 'predict batch size')
+flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of hostname:port pairs')
+flags.DEFINE_integer('task_index', 0, 'Index of task within the job')
+
+FLAGS = flags.FLAGS
+
+
+def main(_):
+  for k in FLAGS:
+    if k in ('h', 'help', 'helpshort', 'helpfull'):
+      continue
+    print("%s=%s" % (k, FLAGS[k].value))
+
+  # worker_count = len(FLAGS.worker_hosts.split(','))
+  # e = create_explainer(FLAGS.saved_model_dir)
+  #
+  # output_names = e.input_names
+  # print("feature_names:", output_names)
+  # print("feature_num:", len(output_names))
+  # e.feature_importance(FLAGS.explain_tables if FLAGS.explain_tables else FLAGS.tables,
+  #                      FLAGS.outputs,
+  #                      reserved_cols=FLAGS.reserved_cols,
+  #                      output_cols=FLAGS.output_cols,
+  #                      batch_size=FLAGS.batch_size,
+  #                      slice_id=FLAGS.task_index,
+  #                      slice_num=worker_count)
+  run(FLAGS)
+
+
+if __name__ == '__main__':
+  tf.app.run(main=main)
diff --git a/easy_rec/python/tools/explainer/methods.py b/easy_rec/python/tools/explainer/methods.py
new file mode 100644
index 000000000..aa7192acc
--- /dev/null
+++ b/easy_rec/python/tools/explainer/methods.py
@@ -0,0 +1,641 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import numpy as np
+from skimage.util import view_as_windows
+import warnings, logging
+import tensorflow as tf
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import nn_grad, math_grad
+from collections import OrderedDict
+from easy_rec.python.tools.explainer.utils import make_batches, slice_arrays, to_list, unpack_singleton
+
+SUPPORTED_ACTIVATIONS = [
+    'Relu', 'Elu', 'Sigmoid', 'Tanh', 'Softplus'
+]
+
+UNSUPPORTED_ACTIVATIONS = [
+    'CRelu', 'Relu6', 'Softsign'
+]
+
+_ENABLED_METHOD_CLASS = None
+_GRAD_OVERRIDE_CHECKFLAG = 0
+
+
+# -----------------------------------------------------------------------------
+# UTILITY FUNCTIONS
+# -----------------------------------------------------------------------------
+
+
+def activation(type):
+    """
+    Returns Tensorflow's activation op, given its type
+    :param type: string
+    :return: op
+    """
+    if type not in SUPPORTED_ACTIVATIONS:
+        warnings.warn('Activation function (%s) not supported' % type)
+    f = getattr(tf.nn, type.lower())
+    return f
+
+
+def original_grad(op, grad):
+    """
+    Return original Tensorflow gradient for an op
+    :param op: op
+    :param grad: Tensor
+    :return: Tensor
+    """
+    if op.type not in SUPPORTED_ACTIVATIONS:
+        warnings.warn('Activation function (%s) not supported' % op.type)
+    opname = '_%sGrad' % op.type
+    if hasattr(nn_grad, opname):
+        f = getattr(nn_grad, opname)
+    else:
+        f = getattr(math_grad, opname)
+    return f(op, grad)
+
+
+# -----------------------------------------------------------------------------
+# ATTRIBUTION METHODS BASE CLASSES
+# -----------------------------------------------------------------------------
+
+
+class AttributionMethod(object):
+    """
+    Attribution method base class
+    """
+    def __init__(self, T, X, session, keras_learning_phase=None):
+        self.T = T  # target Tensor
+        self.X = X  # input Tensor
+        self.Y_shape = [None,] + T.get_shape().as_list()[1:]
+        # Most often T contains multiple output units. In this case, it is often necessary to select
+        # a single unit to compute contributions for. This can be achieved passing 'ys' as weight for the output Tensor.
+        self.Y = tf.placeholder(tf.float32, self.Y_shape)
+        # placeholder_from_data(ys) if ys is not None else 1.0  # Tensor that represents weights for T
+        self.T = self.T * self.Y
+        self.symbolic_attribution = None
+        self.session = session
+        self.keras_learning_phase = keras_learning_phase
+        self.has_multiple_inputs = type(self.X) is list or type(self.X) is tuple
+        logging.info('Model with multiple inputs: %s' % self.has_multiple_inputs)
+
+        # Set baseline
+        # TODO: now this sets a baseline also for those methods that does not require it
+        self._set_check_baseline()
+
+        # References
+        self._init_references()
+
+        # Create symbolic explanation once during construction (affects only gradient-based methods)
+        self.explain_symbolic()
+
+    def explain_symbolic(self):
+        return None
+
+    def run(self, xs, ys=None, batch_size=None):
+        pass
+
+    def _init_references(self):
+        pass
+
+    def _check_input_compatibility(self, xs, ys=None, batch_size=None):
+        if ys is not None:
+            if not self.has_multiple_inputs and len(xs) != len(ys):
+                raise RuntimeError('When provided, ys must have the same batch size as xs (xs has batch size {} and ys {})'.format(len(xs), len(ys)))
+            elif self.has_multiple_inputs and np.all([len(i) != len(ys) for i in xs]):
+                raise RuntimeError('When provided, ys must have the same batch size as all elements of xs')
+        if batch_size is not None and batch_size > 0:
+            if self.T.shape[0].value is not None and self.T.shape[0].value is not batch_size:
+                raise RuntimeError('When using batch evaluation, the first dimension of the target tensor '
+                                   'must be compatible with the batch size. Found %s instead' % self.T.shape[0].value)
+            if isinstance(self.X, list):
+                for x in self.X:
+                    if x.shape[0].value is not None and x.shape[0].value is not batch_size:
+                        raise RuntimeError('When using batch evaluation, the first dimension of the input tensor '
+                                           'must be compatible with the batch size. Found %s instead' % x.shape[
+                                               0].value)
+            else:
+                if self.X.shape[0].value is not None and self.X.shape[0].value is not batch_size:
+                    raise RuntimeError('When using batch evaluation, the first dimension of the input tensor '
+                                       'must be compatible with the batch size. Found %s instead' % self.X.shape[0].value)
+
+    def _session_run_batch(self, T, xs, ys=None):
+        feed_dict = {}
+        if self.has_multiple_inputs:
+            for k, v in zip(self.X, xs):
+                feed_dict[k] = v
+        else:
+            feed_dict[self.X] = xs
+
+        # If ys is not passed, produce a vector of ones that will be broadcasted to all batch samples
+        feed_dict[self.Y] = ys if ys is not None else np.ones([1,] + self.Y_shape[1:])
+
+        if self.keras_learning_phase is not None:
+            feed_dict[self.keras_learning_phase] = 0
+        return self.session.run(T, feed_dict)
+
+    def _session_run(self, T, xs, ys=None, batch_size=None):
+        num_samples = len(xs)
+        if self.has_multiple_inputs is True:
+            num_samples = len(xs[0])
+            if len(xs) != len(self.X):
+                raise RuntimeError('List of input tensors and input data have different lengths (%s and %s)'
+                                   % (str(len(xs)), str(len(self.X))))
+            if batch_size is not None:
+                for xi in xs:
+                    if len(xi) != num_samples:
+                        raise RuntimeError('Evaluation in batches requires all inputs to have '
+                                           'the same number of samples')
+
+        if batch_size is None or batch_size <= 0 or num_samples <= batch_size:
+            return self._session_run_batch(T, xs, ys)
+        else:
+            outs = []
+            batches = make_batches(num_samples, batch_size)
+            for batch_index, (batch_start, batch_end) in enumerate(batches):
+                # Get a batch from data
+                xs_batch = slice_arrays(xs, batch_start, batch_end)
+                # If the target tensor has one entry for each sample, we need to batch it as well
+                ys_batch = None
+                if ys is not None:
+                    ys_batch = slice_arrays(ys, batch_start, batch_end)
+                batch_outs = self._session_run_batch(T, xs_batch, ys_batch)
+                batch_outs = to_list(batch_outs)
+                if batch_index == 0:
+                    # Pre-allocate the results arrays.
+                    for batch_out in batch_outs:
+                        shape = (num_samples,) + batch_out.shape[1:]
+                        outs.append(np.zeros(shape, dtype=batch_out.dtype))
+                for i, batch_out in enumerate(batch_outs):
+                    outs[i][batch_start:batch_end] = batch_out
+            return unpack_singleton(outs)
+
+    def _set_check_baseline(self):
+        # Do nothing for those methods that have no baseline required
+        if not hasattr(self, "baseline"):
+            return
+
+        if self.baseline is None:
+            if self.has_multiple_inputs:
+                self.baseline = [np.zeros([1,] + xi.get_shape().as_list()[1:]) for xi in self.X]
+            else:
+                self.baseline = np.zeros([1,] + self.X.get_shape().as_list()[1:])
+
+        else:
+            if self.has_multiple_inputs:
+                for i, xi in enumerate(self.X):
+                    if list(self.baseline[i].shape) == xi.get_shape().as_list()[1:]:
+                        self.baseline[i] = np.expand_dims(self.baseline[i], 0)
+                    else:
+                        raise RuntimeError('Baseline shape %s does not match expected shape %s'
+                                           % (self.baseline[i].shape, xi.get_shape().as_list()[1:]))
+            else:
+                if list(self.baseline.shape) == self.X.get_shape().as_list()[1:]:
+                    self.baseline = np.expand_dims(self.baseline, 0)
+                else:
+                    raise RuntimeError('Baseline shape %s does not match expected shape %s'
+                                       % (self.baseline.shape, self.X.get_shape().as_list()[1:]))
+
+
+class GradientBasedMethod(AttributionMethod):
+    """
+    Base class for gradient-based attribution methods
+    """
+    def get_symbolic_attribution(self):
+        return tf.gradients(self.T, self.X)
+
+    def explain_symbolic(self):
+        if self.symbolic_attribution is None:
+            self.symbolic_attribution = self.get_symbolic_attribution()
+        return self.symbolic_attribution
+
+    def run(self, xs, ys=None, batch_size=None):
+        self._check_input_compatibility(xs, ys, batch_size)
+        results = self._session_run(self.explain_symbolic(), xs, ys, batch_size)
+        return results[0] if not self.has_multiple_inputs else results
+
+    @classmethod
+    def nonlinearity_grad_override(cls, op, grad):
+        return original_grad(op, grad)
+
+
+class PerturbationBasedMethod(AttributionMethod):
+    """
+       Base class for perturbation-based attribution methods
+       """
+    def __init__(self, T, X, session, keras_learning_phase):
+        super(PerturbationBasedMethod, self).__init__(T, X, session, keras_learning_phase)
+        self.base_activation = None
+
+
+
+# -----------------------------------------------------------------------------
+# ATTRIBUTION METHODS
+# -----------------------------------------------------------------------------
+"""
+Returns zero attributions. For testing only.
+"""
+
+
+class DummyZero(GradientBasedMethod):
+
+    def get_symbolic_attribution(self,):
+        return tf.gradients(self.T, self.X)
+
+    @classmethod
+    def nonlinearity_grad_override(cls, op, grad):
+        input = op.inputs[0]
+        return tf.zeros_like(input)
+
+"""
+Saliency maps
+https://arxiv.org/abs/1312.6034
+"""
+
+
+class Saliency(GradientBasedMethod):
+
+    def get_symbolic_attribution(self):
+        return [tf.abs(g) for g in tf.gradients(self.T, self.X)]
+
+
+"""
+Gradient * Input
+https://arxiv.org/pdf/1704.02685.pdf - https://arxiv.org/abs/1611.07270
+"""
+
+
+class GradientXInput(GradientBasedMethod):
+
+    def get_symbolic_attribution(self):
+        return [g * x for g, x in zip(
+            tf.gradients(self.T, self.X),
+            self.X if self.has_multiple_inputs else [self.X])]
+
+
+"""
+Integrated Gradients
+https://arxiv.org/pdf/1703.01365.pdf
+"""
+
+
+class IntegratedGradients(GradientBasedMethod):
+
+    def __init__(self, T, X, session, keras_learning_phase, steps=100, baseline=None):
+        self.steps = steps
+        self.baseline = baseline
+        super(IntegratedGradients, self).__init__(T, X, session, keras_learning_phase)
+
+    def run(self, xs, ys=None, batch_size=None):
+        self._check_input_compatibility(xs, ys, batch_size)
+
+        gradient = None
+        for alpha in list(np.linspace(1. / self.steps, 1.0, self.steps)):
+            xs_mod = [b + (x - b) * alpha for x, b in zip(xs, self.baseline)] if self.has_multiple_inputs \
+                else self.baseline + (xs - self.baseline) * alpha
+            _attr = self._session_run(self.explain_symbolic(), xs_mod, ys, batch_size)
+            if gradient is None: gradient = _attr
+            else: gradient = [g + a for g, a in zip(gradient, _attr)]
+
+        results = [g * (x - b) / self.steps for g, x, b in zip(
+            gradient,
+            xs if self.has_multiple_inputs else [xs],
+            self.baseline if self.has_multiple_inputs else [self.baseline])]
+
+        return results[0] if not self.has_multiple_inputs else results
+
+
+"""
+Layer-wise Relevance Propagation with epsilon rule
+http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0130140
+"""
+
+
+class EpsilonLRP(GradientBasedMethod):
+    eps = None
+
+    def __init__(self, T, X, session, keras_learning_phase, epsilon=1e-4):
+        assert epsilon > 0.0, 'LRP epsilon must be greater than zero'
+        global eps
+        eps = epsilon
+        super(EpsilonLRP, self).__init__(T, X, session, keras_learning_phase)
+
+    def get_symbolic_attribution(self):
+        return [g * x for g, x in zip(
+            tf.gradients(self.T, self.X),
+            self.X if self.has_multiple_inputs else [self.X])]
+
+    @classmethod
+    def nonlinearity_grad_override(cls, op, grad):
+        output = op.outputs[0]
+        input = op.inputs[0]
+        return grad * output / (input + eps *
+                                tf.where(input >= 0, tf.ones_like(input), -1 * tf.ones_like(input)))
+
+"""
+DeepLIFT
+This reformulation only considers the "Rescale" rule
+https://arxiv.org/abs/1704.02685
+"""
+
+
+class DeepLIFTRescale(GradientBasedMethod):
+
+    _deeplift_ref = {}
+
+    def __init__(self, T, X, session, keras_learning_phase, baseline=None):
+        self.baseline = baseline
+        super(DeepLIFTRescale, self).__init__(T, X, session, keras_learning_phase)
+
+    def get_symbolic_attribution(self):
+        return [g * (x - b) for g, x, b in zip(
+            tf.gradients(self.T, self.X),
+            self.X if self.has_multiple_inputs else [self.X],
+            self.baseline if self.has_multiple_inputs else [self.baseline])]
+
+    @classmethod
+    def nonlinearity_grad_override(cls, op, grad):
+        output = op.outputs[0]
+        input = op.inputs[0]
+        ref_input = cls._deeplift_ref[op.name]
+        ref_output = activation(op.type)(ref_input)
+        delta_out = output - ref_output
+        delta_in = input - ref_input
+        instant_grad = activation(op.type)(0.5 * (ref_input + input))
+        return tf.where(tf.abs(delta_in) > 1e-5, grad * delta_out / delta_in,
+                        original_grad(instant_grad.op, grad))
+
+    def _init_references(self):
+        # print ('DeepLIFT: computing references...')
+        sys.stdout.flush()
+        self._deeplift_ref.clear()
+        ops = []
+        g = tf.get_default_graph()
+        for op in g.get_operations():
+            if len(op.inputs) > 0 and not op.name.startswith('gradients'):
+                if op.type in SUPPORTED_ACTIVATIONS:
+                    ops.append(op)
+        YR = self._session_run([o.inputs[0] for o in ops], self.baseline)
+        for (r, op) in zip(YR, ops):
+            self._deeplift_ref[op.name] = r
+        # print('DeepLIFT: references ready')
+        sys.stdout.flush()
+
+
+"""
+Occlusion method
+Generalization of the grey-box method presented in https://arxiv.org/pdf/1311.2901.pdf
+This method performs a systematic perturbation of contiguous hyperpatches in the input,
+replacing each patch with a user-defined value (by default 0).
+window_shape : integer or tuple of length xs_ndim
+Defines the shape of the elementary n-dimensional orthotope the rolling window view.
+If an integer is given, the shape will be a hypercube of sidelength given by its value.
+step : integer or tuple of length xs_ndim
+Indicates step size at which extraction shall be performed.
+If integer is given, then the step is uniform in all dimensions.
+"""
+
+
+class Occlusion(PerturbationBasedMethod):
+
+    def __init__(self, T, X, session, keras_learning_phase, window_shape=None, step=None):
+        super(Occlusion, self).__init__(T, X, session, keras_learning_phase)
+        if self.has_multiple_inputs:
+            raise RuntimeError('Multiple inputs not yet supported for perturbation methods')
+
+        input_shape = X[0].get_shape().as_list()
+        if window_shape is not None:
+            assert len(window_shape) == len(input_shape), \
+                'window_shape must have length of input (%d)' % len(input_shape)
+            self.window_shape = tuple(window_shape)
+        else:
+            self.window_shape = (1,) * len(input_shape)
+
+        if step is not None:
+            assert isinstance(step, int) or len(step) == len(input_shape), \
+                'step must be integer or tuple with the length of input (%d)' % len(input_shape)
+            self.step = step
+        else:
+            self.step = 1
+        self.replace_value = 0.0
+        logging.info('Input shape: %s; window_shape %s; step %s' % (input_shape, self.window_shape, self.step))
+
+    def run(self, xs, ys=None, batch_size=None):
+        self._check_input_compatibility(xs, ys, batch_size)
+        input_shape = xs.shape[1:]
+        batch_size = xs.shape[0]
+        total_dim = np.asscalar(np.prod(input_shape))
+
+        # Create mask
+        index_matrix = np.arange(total_dim).reshape(input_shape)
+        idx_patches = view_as_windows(index_matrix, self.window_shape, self.step).reshape((-1,) + self.window_shape)
+        heatmap = np.zeros_like(xs, dtype=np.float32).reshape((-1), total_dim)
+        w = np.zeros_like(heatmap)
+
+        # Compute original output
+        eval0 = self._session_run(self.T, xs, ys, batch_size)
+
+        # Start perturbation loop
+        for i, p in enumerate(idx_patches):
+            mask = np.ones(input_shape).flatten()
+            mask[p.flatten()] = self.replace_value
+            masked_xs = mask.reshape((1,) + input_shape) * xs
+            delta = eval0 - self._session_run(self.T, masked_xs, ys, batch_size)
+            delta_aggregated = np.sum(delta.reshape((batch_size, -1)), -1, keepdims=True)
+            heatmap[:, p.flatten()] += delta_aggregated
+            w[:, p.flatten()] += p.size
+
+        attribution = np.reshape(heatmap / w, xs.shape)
+        if np.isnan(attribution).any():
+            warnings.warn('Attributions generated by Occlusion method contain nans, '
+                          'probably because window_shape and step do not allow to cover the all input.')
+        return attribution
+
+
+"""
+Shapley Value sampling
+Computes approximate Shapley Values using "Polynomial calculation of the Shapley value based on sampling",
+Castro et al, 2009 (https://www.sciencedirect.com/science/article/pii/S0305054808000804)
+samples : integer (default 5)
+Defined the number of samples for each input feature. 
+Notice that evaluating a model samples * n_input_feature times might take a while.
+sampling_dims : list of dimension indexes to run sampling on (feature dimensions).
+By default, all dimensions except the batch dimension will be sampled.
+For example, with a 4-D tensor that contains color images, single color channels are sampled.
+To sample pixels, instead, use sampling_dims=[1,2]
+"""
+
+
+class ShapleySampling(PerturbationBasedMethod):
+
+    def __init__(self, T, X, session, keras_learning_phase, samples=5, sampling_dims=None):
+        super(ShapleySampling, self).__init__(T, X, session, keras_learning_phase)
+        if self.has_multiple_inputs:
+            raise RuntimeError('Multiple inputs not yet supported for perturbation methods')
+        dims = len(X.shape)
+        if sampling_dims is not None:
+            if not 0 < len(sampling_dims) <= (dims - 1):
+                raise RuntimeError('sampling_dims must be a list containing 1 to %d elements' % (dims-1))
+            if 0 in sampling_dims:
+                raise RuntimeError('Cannot sample batch dimension: remove 0 from sampling_dims')
+            if any([x < 1 or x > dims-1 for x in sampling_dims]):
+                raise RuntimeError('Invalid value in sampling_dims')
+        else:
+            sampling_dims = list(range(1, dims))
+
+        self.samples = samples
+        self.sampling_dims = sampling_dims
+
+    def run(self, xs, ys=None, batch_size=None):
+        xs_shape = list(xs.shape)
+        batch_size = xs.shape[0]
+        n_features = int(np.asscalar(np.prod([xs.shape[i] for i in self.sampling_dims])))
+        result = np.zeros((xs_shape[0], n_features))
+
+        run_shape = list(xs_shape)  # a copy
+        run_shape = np.delete(run_shape, self.sampling_dims).tolist()
+        run_shape.insert(1, -1)
+
+        reconstruction_shape = [xs_shape[0]]
+        for j in self.sampling_dims:
+            reconstruction_shape.append(xs_shape[j])
+
+        for r in range(self.samples):
+            p = np.random.permutation(n_features)
+            x = xs.copy().reshape(run_shape)
+            y = None
+            for i in p:
+                if y is None:
+                    y = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
+                x[:, i] = 0
+                y0 = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
+                delta = y - y0
+                delta_aggregated = np.sum(delta.reshape((batch_size, -1)), -1, keepdims=False)
+                result[:, i] += delta_aggregated
+                y = y0
+
+        shapley = result / self.samples
+        return shapley.reshape(reconstruction_shape)
+
+
+# -----------------------------------------------------------------------------
+# END ATTRIBUTION METHODS
+# -----------------------------------------------------------------------------
+
+
+attribution_methods = OrderedDict({
+    'zero': (DummyZero, 0),
+    'saliency': (Saliency, 1),
+    'grad*input': (GradientXInput, 2),
+    'intgrad': (IntegratedGradients, 3),
+    'elrp': (EpsilonLRP, 4),
+    'deeplift': (DeepLIFTRescale, 5),
+    'occlusion': (Occlusion, 6),
+    'shapley_sampling': (ShapleySampling, 7)
+})
+
+
+
+@ops.RegisterGradient("DeepExplainGrad")
+def deepexplain_grad(op, grad):
+    global _ENABLED_METHOD_CLASS, _GRAD_OVERRIDE_CHECKFLAG
+    _GRAD_OVERRIDE_CHECKFLAG = 1
+    if _ENABLED_METHOD_CLASS is not None \
+            and issubclass(_ENABLED_METHOD_CLASS, GradientBasedMethod):
+        return _ENABLED_METHOD_CLASS.nonlinearity_grad_override(op, grad)
+    else:
+        return original_grad(op, grad)
+
+
+class DeepExplain(object):
+
+    def __init__(self, graph=None, session=tf.get_default_session()):
+        self.method = None
+        self.batch_size = None
+        self.session = session
+        self.graph = session.graph if graph is None else graph
+        self.graph_context = self.graph.as_default()
+        self.override_context = self.graph.gradient_override_map(self.get_override_map())
+        self.keras_phase_placeholder = None
+        self.context_on = False
+        if self.session is None:
+            raise RuntimeError('DeepExplain: could not retrieve a session. Use DeepExplain(session=your_session).')
+
+    def __enter__(self):
+        # Override gradient of all ops created in context
+        self.graph_context.__enter__()
+        self.override_context.__enter__()
+        self.context_on = True
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.graph_context.__exit__(type, value, traceback)
+        self.override_context.__exit__(type, value, traceback)
+        self.context_on = False
+
+    def get_explainer(self, method, T, X, **kwargs):
+        if not self.context_on:
+            raise RuntimeError('Explain can be called only within a DeepExplain context.')
+        global _ENABLED_METHOD_CLASS, _GRAD_OVERRIDE_CHECKFLAG
+        self.method = method
+        if self.method in attribution_methods:
+            method_class, method_flag = attribution_methods[self.method]
+        else:
+            raise RuntimeError('Method must be in %s' % list(attribution_methods.keys()))
+        if isinstance(X, list):
+            for x in X:
+                if 'tensor' not in str(type(x)).lower():
+                    raise RuntimeError('If a list, X must contain only Tensorflow Tensor objects')
+        else:
+            if 'tensor' not in str(type(X)).lower():
+                raise RuntimeError('X must be a Tensorflow Tensor object or a list of them')
+
+        if 'tensor' not in str(type(T)).lower():
+            raise RuntimeError('T must be a Tensorflow Tensor object')
+
+        logging.info('DeepExplain: running "%s" explanation method (%d)' % (self.method, method_flag))
+        self._check_ops()
+        _GRAD_OVERRIDE_CHECKFLAG = 0
+
+        _ENABLED_METHOD_CLASS = method_class
+        method = _ENABLED_METHOD_CLASS(T, X,
+                                       self.session,
+                                       keras_learning_phase=self.keras_phase_placeholder,
+                                       **kwargs)
+
+        if issubclass(_ENABLED_METHOD_CLASS, GradientBasedMethod) and _GRAD_OVERRIDE_CHECKFLAG == 0:
+            warnings.warn('DeepExplain detected you are trying to use an attribution method that requires '
+                          'gradient override but the original gradient was used instead. You might have forgot to '
+                          '(re)create your graph within the DeepExlain context. Results are not reliable!')
+        _ENABLED_METHOD_CLASS = None
+        _GRAD_OVERRIDE_CHECKFLAG = 0
+        self.keras_phase_placeholder = None
+        return method
+
+    def explain(self, method, T, X, xs, ys=None, batch_size=None, **kwargs):
+        explainer = self.get_explainer(method, T, X, **kwargs)
+        return explainer.run(xs, ys, batch_size)
+
+    @staticmethod
+    def get_override_map():
+        return dict((a, 'DeepExplainGrad') for a in SUPPORTED_ACTIVATIONS)
+
+    def _check_ops(self):
+        """
+        Heuristically check if any op is in the list of unsupported activation functions.
+        This does not cover all cases where explanation methods would fail, and must be improved in the future.
+        Also, check if the placeholder named 'keras_learning_phase' exists in the graph. This is used by Keras
+         and needs to be passed in feed_dict.
+        :return:
+        """
+        g = tf.get_default_graph()
+        for op in g.get_operations():
+            if len(op.inputs) > 0 and not op.name.startswith('gradients'):
+                if op.type in UNSUPPORTED_ACTIVATIONS:
+                    warnings.warn('Detected unsupported activation (%s). '
+                                  'This might lead to unexpected or wrong results.' % op.type)
+            elif 'keras_learning_phase' in op.name:
+                self.keras_phase_placeholder = op.outputs[0]
\ No newline at end of file
diff --git a/easy_rec/python/tools/explainer/utils.py b/easy_rec/python/tools/explainer/utils.py
new file mode 100644
index 000000000..b697bf230
--- /dev/null
+++ b/easy_rec/python/tools/explainer/utils.py
@@ -0,0 +1,69 @@
+import numpy as np
+import tensorflow as tf
+
+# Some of the following functions for batch processing have been borrowed and adapter from Keras
+# https://github.com/keras-team/keras/blob/master/keras/utils/generic_utils.py
+# https://github.com/keras-team/keras/blob/master/keras/engine/training_utils.py
+
+
+def make_batches(size, batch_size):
+    """Returns a list of batch indices (tuples of indices).
+    # Arguments
+        size: Integer, total size of the data to slice into batches.
+        batch_size: Integer, batch size.
+    # Returns
+        A list of tuples of array indices.
+    """
+    num_batches = (size + batch_size - 1) // batch_size  # round up
+    return [(i * batch_size, min(size, (i + 1) * batch_size))
+            for i in range(num_batches)]
+
+
+def to_list(x, allow_tuple=False):
+    """Normalizes a list/tensor into a list.
+    If a tensor is passed, we return
+    a list of size 1 containing the tensor.
+    # Arguments
+        x: target object to be normalized.
+        allow_tuple: If False and x is a tuple,
+            it will be converted into a list
+            with a single element (the tuple).
+            Else converts the tuple to a list.
+    # Returns
+        A list.
+    """
+    if isinstance(x, list):
+        return x
+    if allow_tuple and isinstance(x, tuple):
+        return list(x)
+    return [x]
+
+
+def unpack_singleton(x):
+    """Gets the equivalent np-array if the iterable has only one value.
+    Otherwise return the iterable.
+    # Argument
+        x: A list or tuple.
+    # Returns
+        The same iterable or the iterable converted to a np-array.
+    """
+    if len(x) == 1:
+        return np.array(x)
+    return x
+
+
+def slice_arrays(arrays, start=None, stop=None):
+    """Slices an array or list of arrays.
+    """
+    if arrays is None:
+        return [None]
+    elif isinstance(arrays, list):
+        return [None if x is None else x[start:stop] for x in arrays]
+    else:
+        return arrays[start:stop]
+
+
+def placeholder_from_data(numpy_array):
+    if numpy_array is None:
+        return None
+    return tf.placeholder('float', [None,] + list(numpy_array.shape[1:]))
diff --git a/easy_rec/python/utils/activation.py b/easy_rec/python/utils/activation.py
index d05d705b3..a6ec1374f 100644
--- a/easy_rec/python/utils/activation.py
+++ b/easy_rec/python/utils/activation.py
@@ -4,34 +4,15 @@
 import numpy as np
 import six
 import tensorflow as tf
-from tensorflow.python.keras.layers import Layer
-
 from easy_rec.python.utils.load_class import load_by_path
 
-try:
-  from tensorflow.python.keras.layers import BatchNormalization
-except ImportError:
-  BatchNormalization = tf.keras.layers.BatchNormalization
-
-# try:
-#   from tensorflow.python.ops.init_ops import Zeros
-# except ImportError:
-#   from tensorflow.python.ops.init_ops_v2 import Zeros
-
 
-class Dice(Layer):
+def dice(_x, axis=-1, epsilon=1e-9, name='dice', training=True):
   """The Data Adaptive Activation Function in DIN.
 
   Which can be viewed as a generalization of PReLu, and can adaptively adjust the rectified point
    according to distribution of input data.
 
-  Input shape
-    - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis)
-     when using this layer as the first layer in a model.
-
-  Output shape
-    - Same shape as the input.
-
   Arguments
     - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis).
     - **epsilon** : Small float added to variance to avoid dividing by zero.
@@ -41,44 +22,18 @@ class Dice(Layer):
      Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
      ACM, 2018: 1059-1068.] (https://arxiv.org/pdf/1706.06978.pdf)
   """
-
-  def __init__(self,
-               feat_dim,
-               axis=-1,
-               epsilon=1e-9,
-               is_training=None,
-               **kwargs):
-    super(Dice, self).__init__(**kwargs)
-    self.axis = axis
-    self.epsilon = epsilon
-    self.is_training = is_training
-    self.bn = BatchNormalization(
-        axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
-    self.alphas = tf.Variable(tf.zeros([feat_dim]), dtype=tf.float32)
-
-  # def build(self, input_shape):
-  #   super(Dice, self).build(input_shape)  # Be sure to call this somewhere!
-  #   self.bn = BatchNormalization(
-  #       axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
-  #   self.alphas = self.add_weight(
-  #       shape=(input_shape[-1],),
-  #       initializer=Zeros(),
-  #       dtype=tf.float32,
-  #       name='dice_alpha')  # name='alpha_'+self.name
-  #   self.uses_learning_phase = True
-
-  def call(self, inputs, **kwargs):
-    inputs_normed = self.bn(inputs, training=self.is_training)
-    x_p = tf.sigmoid(inputs_normed)
-    return self.alphas * (1.0 - x_p) * inputs + x_p * inputs
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
-  def get_config(self,):
-    config = {'axis': self.axis, 'epsilon': self.epsilon}
-    base_config = super(Dice, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  alphas = tf.get_variable('alpha_' + name, _x.get_shape()[-1],
+                           initializer=tf.constant_initializer(0.0),
+                           dtype=tf.float32)
+  inputs_normed = tf.layers.batch_normalization(
+    inputs=_x,
+    axis=axis,
+    epsilon=epsilon,
+    center=False,
+    scale=False,
+    training=training)
+  x_p = tf.sigmoid(inputs_normed)
+  return alphas * (1.0 - x_p) * _x + x_p * _x
 
 
 def gelu(x):
@@ -134,7 +89,7 @@ def get_activation(activation_string, **kwargs):
       return tf.nn.leaky_relu
     return tf.keras.layers.PReLU(**kwargs)
   elif act == 'dice':
-    return Dice(**kwargs)
+    return lambda x, name: dice(x, name=name, **kwargs)
   elif act == 'elu':
     return tf.nn.elu
   elif act == 'selu':
@@ -143,7 +98,7 @@ def get_activation(activation_string, **kwargs):
     return tf.tanh
   elif act == 'swish':
     if tf.__version__ < '1.13.0':
-      return lambda x: x * tf.sigmoid(x)
+      return lambda x, name: x * tf.sigmoid(x, name=name)
     return tf.nn.swish
   elif act == 'sigmoid':
     return tf.nn.sigmoid
diff --git a/easy_rec/python/utils/io_util.py b/easy_rec/python/utils/io_util.py
index 091e10e07..4c1c28550 100644
--- a/easy_rec/python/utils/io_util.py
+++ b/easy_rec/python/utils/io_util.py
@@ -97,7 +97,7 @@ def download(oss_or_url, dst_dir=''):
 def create_module_dir(dst_dir):
   if not os.path.exists(dst_dir):
     os.makedirs(dst_dir)
-    with open(os.path.join(dst_dir, '__init__.py'), 'w') as ofile:
+    with open(os.path.join(dst_dir, 'explainer.py'), 'w') as ofile:
       ofile.write('\n')
 
 

From e27b12137afbb1e77dad7ea4a8e863ac59c18ddc Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 1 May 2023 20:16:06 +0800
Subject: [PATCH 17/54] [feat]: add attention normalizer for din

---
 easy_rec/python/layers/din.py      | 10 +++++++---
 easy_rec/python/layers/dnn.py      |  2 +-
 easy_rec/python/protos/layer.proto |  2 ++
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/easy_rec/python/layers/din.py b/easy_rec/python/layers/din.py
index 81f661165..71c6e1ab4 100644
--- a/easy_rec/python/layers/din.py
+++ b/easy_rec/python/layers/din.py
@@ -55,9 +55,13 @@ def __call__(self, inputs, training=None, **kwargs):
     seq_mask = tf.expand_dims(seq_mask, 1)
     paddings = tf.ones_like(scores) * (-2**32 + 1)
     scores = tf.where(seq_mask, scores, paddings)  # [B, 1, L]
-    scores = scores / (seq_emb_size**0.5)
-    # normalization with softmax is abandoned according to the original paper
-    scores = tf.nn.sigmoid(scores)
+    if self.config.attention_normalizer == 'softmax':
+      scores = tf.nn.softmax(scores)  # (B, 1, L)
+    elif self.config.attention_normalizer == 'sigmoid':
+      scores = scores / (seq_emb_size**0.5)
+      scores = tf.nn.sigmoid(scores)
+    else:
+      raise ValueError("unsupported attention normalizer: " + self.config.attention_normalizer)
 
     if target_emb_size < seq_emb_size:
       keys = keys[:, :, :target_emb_size]  # [B, L, E]
diff --git a/easy_rec/python/layers/dnn.py b/easy_rec/python/layers/dnn.py
index d2af5a4cf..ce36dd677 100644
--- a/easy_rec/python/layers/dnn.py
+++ b/easy_rec/python/layers/dnn.py
@@ -34,7 +34,7 @@ def __init__(self,
     self._name = name
     self._is_training = is_training
     logging.info('dnn activation function = %s' % self._config.activation)
-    self.activation = get_activation(self._config.activation, is_training=is_training)
+    self.activation = get_activation(self._config.activation, training=is_training)
     self._last_layer_no_activation = last_layer_no_activation
     self._last_layer_no_batch_norm = last_layer_no_batch_norm
 
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index e2ca2e217..4ddacac5e 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -118,4 +118,6 @@ message DINEncoder {
     required DNN attention_dnn = 1;
     // whether to keep target item feature
     required bool need_target_feature = 2 [default = true];
+    // option: softmax, sigmoid
+    required string attention_normalizer = 3 [default = 'softmax'];
 }

From e834050f7c8e5fb41b0a3890ee516197d43c79a3 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Thu, 4 May 2023 12:47:22 +0800
Subject: [PATCH 18/54] [feat]: add dice activation

---
 easy_rec/python/utils/activation.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/easy_rec/python/utils/activation.py b/easy_rec/python/utils/activation.py
index a6ec1374f..185dee622 100644
--- a/easy_rec/python/utils/activation.py
+++ b/easy_rec/python/utils/activation.py
@@ -6,6 +6,9 @@
 import tensorflow as tf
 from easy_rec.python.utils.load_class import load_by_path
 
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
 
 def dice(_x, axis=-1, epsilon=1e-9, name='dice', training=True):
   """The Data Adaptive Activation Function in DIN.

From 05d0e6447bc22396f23800d77eaeb75e80b1d575 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 5 May 2023 08:32:50 +0800
Subject: [PATCH 19/54] [feat]: add dice activation for dnn layer

---
 .../feature_column/feature_column_v2.py       |  64 ++++++
 easy_rec/python/layers/fscd_layer.py          | 192 ++++++++++++++++++
 easy_rec/python/protos/feature_config.proto   |   3 +
 .../python/protos/variational_dropout.proto   |   6 +
 4 files changed, 265 insertions(+)
 create mode 100644 easy_rec/python/layers/fscd_layer.py

diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py
index e1e4d9304..4610f8e52 100644
--- a/easy_rec/python/compat/feature_column/feature_column_v2.py
+++ b/easy_rec/python/compat/feature_column/feature_column_v2.py
@@ -3377,6 +3377,38 @@ def raw_name(self):
     """See `FeatureColumn` base class."""
     return self.categorical_column.raw_name
 
+  @property
+  def cardinality(self):
+    fc = self.categorical_column
+    if isinstance(fc, HashedCategoricalColumn) or isinstance(fc, CrossedColumn):
+      return fc.hash_bucket_size
+
+    if isinstance(fc, IdentityCategoricalColumn):
+      return fc.num_buckets
+
+    if isinstance(fc, BucketizedColumn):
+      return len(fc.boundaries) + 1
+
+    if isinstance(fc, VocabularyListCategoricalColumn):
+      return len(fc.vocabulary_list) + fc.num_oov_buckets
+
+    if isinstance(fc, VocabularyFileCategoricalColumn):
+      return len(fc.vocabulary_size) + fc.num_oov_buckets
+
+    if isinstance(fc, WeightedCategoricalColumn):
+      sub_fc = fc.categorical_column
+      if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(sub_fc, CrossedColumn):
+        return sub_fc.hash_bucket_size
+      if isinstance(sub_fc, IdentityCategoricalColumn):
+        return sub_fc.num_buckets
+      if isinstance(sub_fc, VocabularyListCategoricalColumn):
+        return len(sub_fc.vocabulary_list) + fc.num_oov_buckets
+      if isinstance(sub_fc, VocabularyFileCategoricalColumn):
+        return len(sub_fc.vocabulary_size) + fc.num_oov_buckets
+      if isinstance(sub_fc, BucketizedColumn):
+        return len(sub_fc.boundaries) + 1
+    return 1
+
   @property
   def parse_example_spec(self):
     """See `FeatureColumn` base class."""
@@ -3727,6 +3759,38 @@ def raw_name(self):
     """See `FeatureColumn` base class."""
     return self.categorical_column.raw_name
 
+  @property
+  def cardinality(self):
+    fc = self.categorical_column
+    if isinstance(fc, HashedCategoricalColumn) or isinstance(fc, CrossedColumn):
+      return fc.hash_bucket_size
+
+    if isinstance(fc, IdentityCategoricalColumn):
+      return fc.num_buckets
+
+    if isinstance(fc, BucketizedColumn):
+      return len(fc.boundaries) + 1
+
+    if isinstance(fc, VocabularyListCategoricalColumn):
+      return len(fc.vocabulary_list) + fc.num_oov_buckets
+
+    if isinstance(fc, VocabularyFileCategoricalColumn):
+      return len(fc.vocabulary_size) + fc.num_oov_buckets
+
+    if isinstance(fc, WeightedCategoricalColumn):
+      sub_fc = fc.categorical_column
+      if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(sub_fc, CrossedColumn):
+        return sub_fc.hash_bucket_size
+      if isinstance(sub_fc, IdentityCategoricalColumn):
+        return sub_fc.num_buckets
+      if isinstance(sub_fc, VocabularyListCategoricalColumn):
+        return len(sub_fc.vocabulary_list) + fc.num_oov_buckets
+      if isinstance(sub_fc, VocabularyFileCategoricalColumn):
+        return len(sub_fc.vocabulary_size) + fc.num_oov_buckets
+      if isinstance(sub_fc, BucketizedColumn):
+        return len(sub_fc.boundaries) + 1
+    return 1
+
   @property
   def parse_example_spec(self):
     """See `FeatureColumn` base class."""
diff --git a/easy_rec/python/layers/fscd_layer.py b/easy_rec/python/layers/fscd_layer.py
new file mode 100644
index 000000000..96ea5fd5c
--- /dev/null
+++ b/easy_rec/python/layers/fscd_layer.py
@@ -0,0 +1,192 @@
+# -*- encoding: utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from collections import OrderedDict
+import json
+import math
+
+import numpy as np
+import tensorflow as tf
+
+from easy_rec.python.compat.feature_column.feature_column import _SharedEmbeddingColumn  # NOQA
+from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn  # NOQA
+from easy_rec.python.compat.feature_column.feature_column_v2 import SharedEmbeddingColumn  # NOQA
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class FSCDLayer(object):
+  """Rank features by variational dropout.
+
+  paper: Towards a Better Tradeoff between Effectiveness and Efficiency in Pre-Ranking,
+    A Learnable Feature Selection based Approach
+  arXiv: 2105.07706
+  """
+
+  def __init__(self,
+               feature_configs,
+               variational_dropout_config,
+               features_dimension,
+               is_training=False,
+               name=''):
+    self._config = variational_dropout_config
+    self.features_dimension = features_dimension
+    self.features_total_dimension = sum(self.features_dimension.values())
+
+    self._dropout_param_size = len(self.features_dimension)
+    self.drop_param_shape = [self._dropout_param_size]
+    self.evaluate = not is_training
+
+    delta_name = 'delta' if name == 'all' else 'delta_%s' % name
+    self.delta = tf.get_variable(
+        name=delta_name,
+        shape=self.drop_param_shape,
+        dtype=tf.float32,
+        initializer=None)
+    tf.add_to_collection(
+        'variational_dropout',
+        json.dumps([name, list(self.features_dimension.items())]))
+
+    if variational_dropout_config.regularize_by_feature_complexity:
+      self.regular_params = self.get_feature_regular_params(feature_configs)
+    self.feature_complexity = {}
+
+  def get_feature_regular_params(self, feature_configs):
+    feature_regularize = {}
+    for config in feature_configs:
+      name = config.input_names[0]
+      if config.HasField('feature_name'):
+        name = config.feature_name
+
+      complexity = self._config.feature_complexity_weight * config.complexity
+
+      # dim = 1.0
+      # if config.HasField('embedding_dim'):
+      #   dim = float(config.embedding_dim)
+      dim = self.features_dimension[name]
+      complexity += self._config.feature_dimension_weight * dim
+
+      cardinal = 1.0
+      if config.HasField('hash_bucket_size'):
+        cardinal = float(config.hash_bucket_size)
+      elif config.HasField('num_buckets'):
+        cardinal = float(config.num_buckets)
+      elif len(config.boundaries) > 0:
+        cardinal = float(len(config.boundaries) + 1)
+      complexity += self._config.feature_cardinality_weight * cardinal
+
+      theta = 1.0 - sigmoid(complexity)
+      alpha = math.log(1.0 - theta) - math.log(theta)
+      feature_regularize[name] = alpha
+
+    return feature_regularize
+
+  def get_lambda(self):
+    return self._config.regularization_lambda
+
+  def build_expand_index(self, batch_size):
+    # Build index_list--->[[0,0],[0,0],[0,0],[0,0],[0,1]......]
+    expanded_index = []
+    for i, index_loop_count in enumerate(self.features_dimension.values()):
+      for m in range(index_loop_count):
+        expanded_index.append([i])
+    expanded_index = tf.tile(expanded_index, [batch_size, 1])
+    batch_size_range = tf.range(batch_size)
+    expand_range_axis = tf.expand_dims(batch_size_range, 1)
+    batch_size_range_expand_dim_len = tf.tile(
+        expand_range_axis, [1, self.features_total_dimension])
+    index_i = tf.reshape(batch_size_range_expand_dim_len, [-1, 1])
+    expanded_index = tf.concat([index_i, expanded_index], 1)
+    return expanded_index
+
+  def sample_noisy_input(self, input):
+    batch_size = tf.shape(input)[0]
+    if self.evaluate:
+      expanded_dims_logit_p = tf.expand_dims(self.logit_p, 0)
+      expanded_logit_p = tf.tile(expanded_dims_logit_p, [batch_size, 1])
+      p = tf.sigmoid(expanded_logit_p)
+      if self.variational_dropout_wise():
+        scaled_input = input * (1 - p)
+      else:
+        # expand dropout layer
+        expanded_index = self.build_expand_index(batch_size)
+        expanded_p = tf.gather_nd(p, expanded_index)
+        expanded_p = tf.reshape(expanded_p, [-1, self.features_total_dimension])
+        scaled_input = input * (1 - expanded_p)
+
+      return scaled_input
+    else:
+      bern_val = self.sampled_from_logit_p(batch_size)
+      bern_val = tf.reshape(bern_val, [-1, self.features_total_dimension])
+      noisy_input = input * bern_val
+      return noisy_input
+
+  def sampled_from_logit_p(self, num_samples):
+    expand_dims_logit_p = tf.expand_dims(self.logit_p, 0)
+    expand_logit_p = tf.tile(expand_dims_logit_p, [num_samples, 1])
+    dropout_p = tf.sigmoid(expand_logit_p)
+    bern_val = self.concrete_dropout_neuron(dropout_p)
+
+    if self.variational_dropout_wise():
+      return bern_val, bern_val
+    else:
+      # from feature_num to embedding_dim_num
+      expanded_index = self.build_expand_index(num_samples)
+      bern_val_gather_nd = tf.gather_nd(bern_val, expanded_index)
+      return bern_val_gather_nd, bern_val
+
+  def concrete_dropout_neuron(self, dropout_p, temp=1.0 / 10.0):
+    EPSILON = np.finfo(float).eps
+    unif_noise = tf.random_uniform(
+        tf.shape(dropout_p), dtype=tf.float32, seed=None, name='unif_noise')
+
+    approx = (
+        tf.log(dropout_p + EPSILON) - tf.log(1. - dropout_p + EPSILON) +
+        tf.log(unif_noise + EPSILON) - tf.log(1. - unif_noise + EPSILON))
+
+    approx_output = tf.sigmoid(approx / temp)
+    return 1 - approx_output
+
+  def compute_regular_params(self, cols_to_feature):
+    alphas = OrderedDict()
+    for fc, fea in cols_to_feature.items():
+      dim = int(fea.shape[-1])
+      complexity = self.feature_complexity[fc.raw_name]
+      cardinal = 1
+      if isinstance(fc, EmbeddingColumn) or isinstance(fc, SharedEmbeddingColumn):
+        cardinal = fc.cardinality
+      c = self._config.feature_complexity_weight * complexity
+      c += self._config.feature_cardinality_weight * cardinal
+      c += self._config.feature_dimension_weight * dim
+      theta = 1.0 - sigmoid(complexity)
+      alpha = math.log(1.0 - theta) - math.log(theta)
+      alphas[fc] = alpha
+    return alphas
+
+  def __call__(self, cols_to_feature):
+    """
+    cols_to_feature: an ordered dict mapping feature_column to feature_values
+    """
+    alphas = self.compute_regular_params(cols_to_feature)
+    feature_columns = cols_to_feature.keys()
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      value = cols_to_feature[column]
+
+    batch_size = tf.shape(output_features)[0]
+    noisy_input, z = self.sample_noisy_input(output_features)
+    dropout_p = tf.sigmoid(self.logit_p)
+    variational_dropout_penalty = 1. - dropout_p
+    if self._config.regularize_by_feature_complexity:
+      pass
+    else:
+      variational_dropout_penalty_lambda = self.get_lambda() / tf.cast(
+        batch_size, dtype=tf.float32)
+      variational_dropout_loss_sum = variational_dropout_penalty_lambda * tf.reduce_sum(
+        variational_dropout_penalty, axis=0)
+    tf.add_to_collection('variational_dropout_loss',
+                         variational_dropout_loss_sum)
+    return noisy_input
+
+
+def sigmoid(x):
+  return x / (1 + math.exp(-x))
diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto
index 5ed305c10..b642fff23 100644
--- a/easy_rec/python/protos/feature_config.proto
+++ b/easy_rec/python/protos/feature_config.proto
@@ -128,6 +128,9 @@ message FeatureConfig {
 
     // embedding variable params
     optional EVParams ev_params = 31;
+
+    // fg complexity
+    optional float complexity = 32 [default = 1.0];
 }
 
 message FeatureConfigV2 {
diff --git a/easy_rec/python/protos/variational_dropout.proto b/easy_rec/python/protos/variational_dropout.proto
index e72ca54c6..afe4d061c 100644
--- a/easy_rec/python/protos/variational_dropout.proto
+++ b/easy_rec/python/protos/variational_dropout.proto
@@ -7,4 +7,10 @@ message  VariationalDropoutLayer{
     optional float regularization_lambda = 1 [default = 0.01];
     // variational_dropout dimension
     optional bool embedding_wise_variational_dropout = 2 [default = false];
+    // whether to use FSCD model
+    optional bool regularize_by_feature_complexity = 3 [default = false];
+
+    optional float feature_complexity_weight = 4 [default = 1.0];
+    optional float feature_dimension_weight = 5 [default = 1e-2];
+    optional float feature_cardinality_weight = 6 [default = 1e-7];
 }

From 23962b23af7859b2691179e3ee962d405178dd4b Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 5 May 2023 14:10:18 +0800
Subject: [PATCH 20/54] [feat]: add FSCD layer

---
 easy_rec/python/layers/fscd_layer.py  | 187 +++++++++-----------------
 easy_rec/python/layers/input_layer.py |  54 +++++---
 2 files changed, 94 insertions(+), 147 deletions(-)

diff --git a/easy_rec/python/layers/fscd_layer.py b/easy_rec/python/layers/fscd_layer.py
index 96ea5fd5c..c8f94bc81 100644
--- a/easy_rec/python/layers/fscd_layer.py
+++ b/easy_rec/python/layers/fscd_layer.py
@@ -1,9 +1,7 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from collections import OrderedDict
-import json
 import math
-
 import numpy as np
 import tensorflow as tf
 
@@ -15,6 +13,38 @@
   tf = tf.compat.v1
 
 
+def get_feature_complexity(feature_configs):
+  feature_complexity = {}
+  for config in feature_configs:
+    name = config.input_names[0]
+    if config.HasField('feature_name'):
+      name = config.feature_name
+    feature_complexity[name] = config.complexity
+
+    # complexity = self._config.feature_complexity_weight * config.complexity
+    #
+    # # dim = 1.0
+    # # if config.HasField('embedding_dim'):
+    # #   dim = float(config.embedding_dim)
+    # dim = self.features_dimension[name]
+    # complexity += self._config.feature_dimension_weight * dim
+    #
+    # cardinal = 1.0
+    # if config.HasField('hash_bucket_size'):
+    #   cardinal = float(config.hash_bucket_size)
+    # elif config.HasField('num_buckets'):
+    #   cardinal = float(config.num_buckets)
+    # elif len(config.boundaries) > 0:
+    #   cardinal = float(len(config.boundaries) + 1)
+    # complexity += self._config.feature_cardinality_weight * cardinal
+    #
+    # theta = 1.0 - sigmoid(complexity)
+    # alpha = math.log(1.0 - theta) - math.log(theta)
+    # feature_regularize[name] = alpha
+
+  return feature_complexity
+
+
 class FSCDLayer(object):
   """Rank features by variational dropout.
 
@@ -26,126 +56,28 @@ class FSCDLayer(object):
   def __init__(self,
                feature_configs,
                variational_dropout_config,
-               features_dimension,
                is_training=False,
                name=''):
     self._config = variational_dropout_config
-    self.features_dimension = features_dimension
-    self.features_total_dimension = sum(self.features_dimension.values())
+    self.is_training = is_training
+    self.name = name
+    self.feature_complexity = get_feature_complexity(feature_configs)
 
-    self._dropout_param_size = len(self.features_dimension)
-    self.drop_param_shape = [self._dropout_param_size]
-    self.evaluate = not is_training
-
-    delta_name = 'delta' if name == 'all' else 'delta_%s' % name
-    self.delta = tf.get_variable(
+  def compute_dropout_mask(self, n, temperature=0.1):
+    delta_name = 'delta' if self.name == 'all' else 'delta_%s' % self.name
+    delta = tf.get_variable(
         name=delta_name,
-        shape=self.drop_param_shape,
+        shape=[n],
         dtype=tf.float32,
-        initializer=None)
-    tf.add_to_collection(
-        'variational_dropout',
-        json.dumps([name, list(self.features_dimension.items())]))
-
-    if variational_dropout_config.regularize_by_feature_complexity:
-      self.regular_params = self.get_feature_regular_params(feature_configs)
-    self.feature_complexity = {}
-
-  def get_feature_regular_params(self, feature_configs):
-    feature_regularize = {}
-    for config in feature_configs:
-      name = config.input_names[0]
-      if config.HasField('feature_name'):
-        name = config.feature_name
-
-      complexity = self._config.feature_complexity_weight * config.complexity
-
-      # dim = 1.0
-      # if config.HasField('embedding_dim'):
-      #   dim = float(config.embedding_dim)
-      dim = self.features_dimension[name]
-      complexity += self._config.feature_dimension_weight * dim
-
-      cardinal = 1.0
-      if config.HasField('hash_bucket_size'):
-        cardinal = float(config.hash_bucket_size)
-      elif config.HasField('num_buckets'):
-        cardinal = float(config.num_buckets)
-      elif len(config.boundaries) > 0:
-        cardinal = float(len(config.boundaries) + 1)
-      complexity += self._config.feature_cardinality_weight * cardinal
+        initializer=tf.constant_initializer(0.5))
 
-      theta = 1.0 - sigmoid(complexity)
-      alpha = math.log(1.0 - theta) - math.log(theta)
-      feature_regularize[name] = alpha
-
-    return feature_regularize
-
-  def get_lambda(self):
-    return self._config.regularization_lambda
-
-  def build_expand_index(self, batch_size):
-    # Build index_list--->[[0,0],[0,0],[0,0],[0,0],[0,1]......]
-    expanded_index = []
-    for i, index_loop_count in enumerate(self.features_dimension.values()):
-      for m in range(index_loop_count):
-        expanded_index.append([i])
-    expanded_index = tf.tile(expanded_index, [batch_size, 1])
-    batch_size_range = tf.range(batch_size)
-    expand_range_axis = tf.expand_dims(batch_size_range, 1)
-    batch_size_range_expand_dim_len = tf.tile(
-        expand_range_axis, [1, self.features_total_dimension])
-    index_i = tf.reshape(batch_size_range_expand_dim_len, [-1, 1])
-    expanded_index = tf.concat([index_i, expanded_index], 1)
-    return expanded_index
-
-  def sample_noisy_input(self, input):
-    batch_size = tf.shape(input)[0]
-    if self.evaluate:
-      expanded_dims_logit_p = tf.expand_dims(self.logit_p, 0)
-      expanded_logit_p = tf.tile(expanded_dims_logit_p, [batch_size, 1])
-      p = tf.sigmoid(expanded_logit_p)
-      if self.variational_dropout_wise():
-        scaled_input = input * (1 - p)
-      else:
-        # expand dropout layer
-        expanded_index = self.build_expand_index(batch_size)
-        expanded_p = tf.gather_nd(p, expanded_index)
-        expanded_p = tf.reshape(expanded_p, [-1, self.features_total_dimension])
-        scaled_input = input * (1 - expanded_p)
-
-      return scaled_input
-    else:
-      bern_val = self.sampled_from_logit_p(batch_size)
-      bern_val = tf.reshape(bern_val, [-1, self.features_total_dimension])
-      noisy_input = input * bern_val
-      return noisy_input
-
-  def sampled_from_logit_p(self, num_samples):
-    expand_dims_logit_p = tf.expand_dims(self.logit_p, 0)
-    expand_logit_p = tf.tile(expand_dims_logit_p, [num_samples, 1])
-    dropout_p = tf.sigmoid(expand_logit_p)
-    bern_val = self.concrete_dropout_neuron(dropout_p)
-
-    if self.variational_dropout_wise():
-      return bern_val, bern_val
-    else:
-      # from feature_num to embedding_dim_num
-      expanded_index = self.build_expand_index(num_samples)
-      bern_val_gather_nd = tf.gather_nd(bern_val, expanded_index)
-      return bern_val_gather_nd, bern_val
-
-  def concrete_dropout_neuron(self, dropout_p, temp=1.0 / 10.0):
     EPSILON = np.finfo(float).eps
-    unif_noise = tf.random_uniform(
-        tf.shape(dropout_p), dtype=tf.float32, seed=None, name='unif_noise')
+    unif_noise = tf.random_uniform([n], dtype=tf.float32, seed=None, name='uniform_noise')
 
     approx = (
-        tf.log(dropout_p + EPSILON) - tf.log(1. - dropout_p + EPSILON) +
+        tf.log(delta + EPSILON) - tf.log(1. - delta + EPSILON) +
         tf.log(unif_noise + EPSILON) - tf.log(1. - unif_noise + EPSILON))
-
-    approx_output = tf.sigmoid(approx / temp)
-    return 1 - approx_output
+    return tf.sigmoid(approx / temperature)
 
   def compute_regular_params(self, cols_to_feature):
     alphas = OrderedDict()
@@ -167,26 +99,29 @@ def __call__(self, cols_to_feature):
     """
     cols_to_feature: an ordered dict mapping feature_column to feature_values
     """
-    alphas = self.compute_regular_params(cols_to_feature)
+    output_tensors = []
+    alphas = []
+    z = self.compute_dropout_mask(len(cols_to_feature))  # keep ratio
+    regular = self.compute_regular_params(cols_to_feature)
     feature_columns = cols_to_feature.keys()
     for column in sorted(feature_columns, key=lambda x: x.name):
       value = cols_to_feature[column]
+      alpha = regular[column]
+      i = len(output_tensors)
+      out = value * z[i] if self.is_training else value
+      cols_to_feature[column] = out
+      output_tensors.append(out)
+      alphas.append(alpha)
+
+    output_features = tf.concat(output_tensors, 1)
 
     batch_size = tf.shape(output_features)[0]
-    noisy_input, z = self.sample_noisy_input(output_features)
-    dropout_p = tf.sigmoid(self.logit_p)
-    variational_dropout_penalty = 1. - dropout_p
-    if self._config.regularize_by_feature_complexity:
-      pass
-    else:
-      variational_dropout_penalty_lambda = self.get_lambda() / tf.cast(
-        batch_size, dtype=tf.float32)
-      variational_dropout_loss_sum = variational_dropout_penalty_lambda * tf.reduce_sum(
-        variational_dropout_penalty, axis=0)
-    tf.add_to_collection('variational_dropout_loss',
-                         variational_dropout_loss_sum)
-    return noisy_input
+    t_alpha = tf.convert_to_tensor(alphas)  # [M]
+    loss = tf.reduce_sum(t_alpha * z) / batch_size
+
+    tf.add_to_collection('variational_dropout_loss', loss)
+    return output_features
 
 
 def sigmoid(x):
-  return x / (1 + math.exp(-x))
+  return 1. / (1. + math.exp(-x))
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index fa17a1c15..6900a9bda 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -12,6 +12,7 @@
 from easy_rec.python.feature_column.feature_group import FeatureGroup
 from easy_rec.python.layers import sequence_feature_layer
 from easy_rec.python.layers import variational_dropout_layer
+from easy_rec.python.layers.fscd_layer import FSCDLayer
 from easy_rec.python.layers.common_layers import text_cnn
 from easy_rec.python.protos.feature_config_pb2 import WideOrDeep
 from easy_rec.python.utils import shape_utils
@@ -37,6 +38,7 @@ def __init__(self,
                embedding_regularizer=None,
                kernel_regularizer=None,
                is_training=False):
+    self._feature_configs = feature_configs
     self._feature_groups = {
         x.group_name: FeatureGroup(x) for x in feature_groups_config
     }
@@ -182,12 +184,8 @@ def single_call_input_layer(self,
         group_columns,
         cols_to_output_tensors=cols_to_output_tensors,
         feature_name_to_output_tensors=feature_name_to_output_tensors)
-    # embedding_reg_lst = [output_features]
+
     embedding_reg_lst = []
-    for col, val in cols_to_output_tensors.items():
-      if isinstance(col, EmbeddingColumn) or isinstance(col,
-                                                        SharedEmbeddingColumn):
-        embedding_reg_lst.append(val)
     builder = feature_column._LazyBuilder(features)
     seq_features = []
     for column in sorted(group_seq_columns, key=lambda x: x.name):
@@ -226,30 +224,44 @@ def single_call_input_layer(self,
           cols_to_output_tensors[column] = cnn_feature
         else:
           raise NotImplementedError
+
     if self._variational_dropout_config is not None:
-      features_dimension = OrderedDict([
-          (k.raw_name, int(v.shape[-1]))
-          for k, v in cols_to_output_tensors.items()
-      ])
-      concat_features = array_ops.concat(
+      if self._variational_dropout_config.regularize_by_feature_complexity:
+        fscd = FSCDLayer(self._feature_configs, self._variational_dropout_config,
+                         is_training=self._is_training, name=group_name)
+        output_features = fscd(cols_to_output_tensors)
+        concat_features = array_ops.concat(
           [output_features] + seq_features, axis=-1)
-      variational_dropout = variational_dropout_layer.VariationalDropoutLayer(
-          self._variational_dropout_config,
-          features_dimension,
-          self._is_training,
-          name=group_name)
-      concat_features = variational_dropout(concat_features)
-      group_features = tf.split(
-          concat_features, list(features_dimension.values()), axis=-1)
+        group_features = [cols_to_output_tensors[x] for x in group_columns] + \
+                         [cols_to_output_tensors[x] for x in group_seq_columns]
+      else:
+        features_dimension = OrderedDict([
+            (k.raw_name, int(v.shape[-1]))
+            for k, v in cols_to_output_tensors.items()
+        ])
+        concat_features = array_ops.concat(
+            [output_features] + seq_features, axis=-1)
+        variational_dropout = variational_dropout_layer.VariationalDropoutLayer(
+            self._variational_dropout_config,
+            features_dimension,
+            self._is_training,
+            name=group_name)
+        concat_features = variational_dropout(concat_features)
+        group_features = tf.split(
+            concat_features, list(features_dimension.values()), axis=-1)
     else:
       concat_features = array_ops.concat(
           [output_features] + seq_features, axis=-1)
       group_features = [cols_to_output_tensors[x] for x in group_columns] + \
                        [cols_to_output_tensors[x] for x in group_seq_columns]
 
-      if embedding_reg_lst:
-        regularizers.apply_regularization(
-            self._embedding_regularizer, weights_list=embedding_reg_lst)
+    for fc, val in cols_to_output_tensors.items():
+      if isinstance(fc, EmbeddingColumn) or isinstance(fc, SharedEmbeddingColumn):
+        embedding_reg_lst.append(val)
+
+    if embedding_reg_lst:
+      regularizers.apply_regularization(
+          self._embedding_regularizer, weights_list=embedding_reg_lst)
     return concat_features, group_features
 
   def get_wide_deep_dict(self):

From 5dfb29f6a198f460fe7109c0b51e3d047a359262 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 5 May 2023 16:23:30 +0800
Subject: [PATCH 21/54] [feat]: add dice activation for dnn layer

---
 .../compat/feature_column/feature_column.py   | 37 ++++++++++++++++++-
 .../feature_column/feature_column_v2.py       |  4 +-
 .../python/feature_column/feature_column.py   |  8 +---
 easy_rec/python/layers/input_layer.py         |  5 +--
 4 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/easy_rec/python/compat/feature_column/feature_column.py b/easy_rec/python/compat/feature_column/feature_column.py
index 1eb27717d..19b1a570b 100644
--- a/easy_rec/python/compat/feature_column/feature_column.py
+++ b/easy_rec/python/compat/feature_column/feature_column.py
@@ -167,6 +167,9 @@
 
 from easy_rec.python.compat import embedding_ops as ev_embedding_ops
 from easy_rec.python.compat.feature_column import utils as fc_utils
+from easy_rec.python.compat.feature_column.feature_column_v2 import HashedCategoricalColumn, BucketizedColumn,\
+  WeightedCategoricalColumn, SequenceWeightedCategoricalColumn, CrossedColumn, IdentityCategoricalColumn,\
+  VocabularyListCategoricalColumn, VocabularyFileCategoricalColumn
 
 
 def _internal_input_layer(features,
@@ -2530,7 +2533,39 @@ def name(self):
 
   @property
   def raw_name(self):
-    return self.categorical_column.name
+    return self.categorical_column.raw_name
+
+  @property
+  def cardinality(self):
+    fc = self.categorical_column
+    if isinstance(fc, HashedCategoricalColumn) or isinstance(fc, CrossedColumn):
+      return fc.hash_bucket_size
+
+    if isinstance(fc, IdentityCategoricalColumn):
+      return fc.num_buckets
+
+    if isinstance(fc, BucketizedColumn):
+      return len(fc.boundaries) + 1
+
+    if isinstance(fc, VocabularyListCategoricalColumn):
+      return len(fc.vocabulary_list) + fc.num_oov_buckets
+
+    if isinstance(fc, VocabularyFileCategoricalColumn):
+      return len(fc.vocabulary_size) + fc.num_oov_buckets
+
+    if isinstance(fc, WeightedCategoricalColumn) or isinstance(fc, SequenceWeightedCategoricalColumn):
+      sub_fc = fc.categorical_column
+      if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(sub_fc, CrossedColumn):
+        return sub_fc.hash_bucket_size
+      if isinstance(sub_fc, IdentityCategoricalColumn):
+        return sub_fc.num_buckets
+      if isinstance(sub_fc, VocabularyListCategoricalColumn):
+        return len(sub_fc.vocabulary_list) + fc.num_oov_buckets
+      if isinstance(sub_fc, VocabularyFileCategoricalColumn):
+        return len(sub_fc.vocabulary_size) + fc.num_oov_buckets
+      if isinstance(sub_fc, BucketizedColumn):
+        return len(sub_fc.boundaries) + 1
+    return 1
 
   @property
   def _var_scope_name(self):
diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py
index 4610f8e52..13a175041 100644
--- a/easy_rec/python/compat/feature_column/feature_column_v2.py
+++ b/easy_rec/python/compat/feature_column/feature_column_v2.py
@@ -3395,7 +3395,7 @@ def cardinality(self):
     if isinstance(fc, VocabularyFileCategoricalColumn):
       return len(fc.vocabulary_size) + fc.num_oov_buckets
 
-    if isinstance(fc, WeightedCategoricalColumn):
+    if isinstance(fc, WeightedCategoricalColumn) or isinstance(fc, SequenceWeightedCategoricalColumn):
       sub_fc = fc.categorical_column
       if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(sub_fc, CrossedColumn):
         return sub_fc.hash_bucket_size
@@ -3777,7 +3777,7 @@ def cardinality(self):
     if isinstance(fc, VocabularyFileCategoricalColumn):
       return len(fc.vocabulary_size) + fc.num_oov_buckets
 
-    if isinstance(fc, WeightedCategoricalColumn):
+    if isinstance(fc, WeightedCategoricalColumn) or isinstance(fc, SequenceWeightedCategoricalColumn):
       sub_fc = fc.categorical_column
       if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(sub_fc, CrossedColumn):
         return sub_fc.hash_bucket_size
diff --git a/easy_rec/python/feature_column/feature_column.py b/easy_rec/python/feature_column/feature_column.py
index 04fc07baf..cc7cfbe77 100644
--- a/easy_rec/python/feature_column/feature_column.py
+++ b/easy_rec/python/feature_column/feature_column.py
@@ -331,12 +331,8 @@ def parse_tag_feature(self, config):
           default_value=0,
           feature_name=feature_name)
 
-    if len(config.input_names) > 1:
-      tag_fc = feature_column.weighted_categorical_column(
-          tag_fc, weight_feature_key=feature_name + '_w', dtype=tf.float32)
-    elif config.HasField('kv_separator'):
-      tag_fc = feature_column.weighted_categorical_column(
-          tag_fc, weight_feature_key=feature_name + '_w', dtype=tf.float32)
+    tag_fc = feature_column.weighted_categorical_column(
+        tag_fc, weight_feature_key=feature_name + '_w', dtype=tf.float32)
 
     if self.is_wide(config):
       self._add_wide_embedding_column(tag_fc, config)
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index 6900a9bda..8098057ad 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -138,8 +138,7 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
         group_features = [cols_to_output_tensors[x] for x in group_columns]
 
         for col, val in cols_to_output_tensors.items():
-          if isinstance(col, EmbeddingColumn) or isinstance(
-              col, SharedEmbeddingColumn):
+          if isinstance(col, EmbeddingColumn) or isinstance(col, _SharedEmbeddingColumn) or isinstance(col, SharedEmbeddingColumn):
             embedding_reg_lst.append(val)
 
       builder = feature_column._LazyBuilder(features)
@@ -256,7 +255,7 @@ def single_call_input_layer(self,
                        [cols_to_output_tensors[x] for x in group_seq_columns]
 
     for fc, val in cols_to_output_tensors.items():
-      if isinstance(fc, EmbeddingColumn) or isinstance(fc, SharedEmbeddingColumn):
+      if isinstance(fc, EmbeddingColumn) or isinstance(fc, _SharedEmbeddingColumn) or isinstance(fc, SharedEmbeddingColumn):
         embedding_reg_lst.append(val)
 
     if embedding_reg_lst:

From 51428ce799dfa89284aae3e9859c4f832b47b23c Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 8 May 2023 20:05:22 +0800
Subject: [PATCH 22/54] [feat]: add dice activation for dnn layer

---
 docs/source/train.md                          |    8 +-
 .../compat/feature_column/feature_column.py   |   13 +-
 .../feature_column/feature_column_v2.py       |  213 +++-
 .../python/feature_column/feature_column.py   |   30 +-
 easy_rec/python/input/augment.py              |  113 +-
 easy_rec/python/input/input.py                |   37 +-
 easy_rec/python/layers/bst.py                 |   52 +-
 easy_rec/python/layers/din.py                 |    3 +-
 easy_rec/python/layers/fscd_layer.py          |  132 ++-
 easy_rec/python/layers/input_layer.py         |   21 +-
 .../layers/multihead_cross_attention.py       |    9 +-
 easy_rec/python/loss/nce_loss.py              |   51 +-
 easy_rec/python/model/easy_rec_model.py       |   24 +-
 easy_rec/python/protos/feature_config.proto   |    1 +
 .../python/protos/variational_dropout.proto   |    5 +-
 easy_rec/python/tools/explainer/deep_shap.py  |  420 ++++---
 easy_rec/python/tools/explainer/explainer.py  |  164 +--
 .../tools/explainer/feature_importance.py     |   13 +-
 easy_rec/python/tools/explainer/methods.py    | 1016 +++++++++--------
 easy_rec/python/tools/explainer/utils.py      |   97 +-
 easy_rec/python/utils/activation.py           |    2 +-
 setup.cfg                                     |    2 +-
 22 files changed, 1472 insertions(+), 954 deletions(-)

diff --git a/docs/source/train.md b/docs/source/train.md
index e58bb6862..67a79ad91 100644
--- a/docs/source/train.md
+++ b/docs/source/train.md
@@ -155,7 +155,7 @@ EasyRec支持两种损失函数配置方式：1）使用单个损失函数；2
 - PAIRWISE_FOCAL_LOSS 的参数配置
 
   - gamma: focal loss的指数，默认值2.0
-  - alpha: 调节样本权重的类别平衡参数，建议根据正负样本比例来配置alpha，  $\\frac{\\alpha}{1-\\alpha}=\\frac{#Neg}{#Pos}$
+  - alpha: 调节样本权重的类别平衡参数，建议根据正负样本比例来配置alpha，即 alpha / (1-alpha) = #Neg / #Pos
   - session_name: pair分组的字段名，比如user_id
   - hinge_margin: 当pair的logit之差大于该参数值时，当前样本的loss为0，默认值为1.0
   - ohem_ratio: 困难样本的百分比，只有部分困难样本参与loss计算，默认值为1.0
@@ -179,7 +179,7 @@ EasyRec支持两种损失函数配置方式：1）使用单个损失函数；2
 - BINARY_FOCAL_LOSS 的参数配置
 
   - gamma: focal loss的指数，默认值2.0
-  - alpha: 调节样本权重的类别平衡参数，建议根据正负样本比例来配置alpha，  $\\frac{\\alpha}{1-\\alpha}=\\frac{#Neg}{#Pos}$
+  - alpha: 调节样本权重的类别平衡参数，建议根据正负样本比例来配置alpha，即 alpha / (1-alpha) = #Neg / #Pos
   - ohem_ratio: 困难样本的百分比，只有部分困难样本参与loss计算，默认值为1.0
   - label_smoothing: 标签平滑系数
 
@@ -188,12 +188,12 @@ EasyRec支持两种损失函数配置方式：1）使用单个损失函数；2
   - alpha: ranking loss 与 calibration loss 的相对权重系数；不设置该值时，触发权重自适应学习
   - session_name: list分组的字段名，比如user_id
   - 参考论文：《 [Joint Optimization of Ranking and Calibration with Contextualized Hybrid Model](https://arxiv.org/pdf/2208.06164.pdf) 》
-  - 使用示例： [dbmtl_with_jrc_loss.config](https://github.com/alibaba/EasyRec/blob/master/samples/model_config/dbmtl_on_taobao_with_multi_loss.config)
+  - 使用示例: [dbmtl_with_jrc_loss.config](https://github.com/alibaba/EasyRec/blob/master/samples/model_config/dbmtl_on_taobao_with_multi_loss.config)
 
 排序模型同时使用多个损失函数的完整示例：
 [cmbf_with_multi_loss.config](https://github.com/alibaba/EasyRec/blob/master/samples/model_config/cmbf_with_multi_loss.config)
 
-多目标排序模型同时使用多个损失函数的完整示例：
+多目标排序模型同时使用多个损失函数的完整示例:
 [dbmtl_with_multi_loss.config](https://github.com/alibaba/EasyRec/blob/master/samples/model_config/dbmtl_on_taobao_with_multi_loss.config)
 
 ##### 损失函数权重自适应学习
diff --git a/easy_rec/python/compat/feature_column/feature_column.py b/easy_rec/python/compat/feature_column/feature_column.py
index 19b1a570b..56d3357c7 100644
--- a/easy_rec/python/compat/feature_column/feature_column.py
+++ b/easy_rec/python/compat/feature_column/feature_column.py
@@ -167,9 +167,6 @@
 
 from easy_rec.python.compat import embedding_ops as ev_embedding_ops
 from easy_rec.python.compat.feature_column import utils as fc_utils
-from easy_rec.python.compat.feature_column.feature_column_v2 import HashedCategoricalColumn, BucketizedColumn,\
-  WeightedCategoricalColumn, SequenceWeightedCategoricalColumn, CrossedColumn, IdentityCategoricalColumn,\
-  VocabularyListCategoricalColumn, VocabularyFileCategoricalColumn
 
 
 def _internal_input_layer(features,
@@ -2537,6 +2534,10 @@ def raw_name(self):
 
   @property
   def cardinality(self):
+    from easy_rec.python.compat.feature_column.feature_column_v2 import HashedCategoricalColumn, BucketizedColumn, \
+      WeightedCategoricalColumn, SequenceWeightedCategoricalColumn, CrossedColumn, IdentityCategoricalColumn, \
+      VocabularyListCategoricalColumn, VocabularyFileCategoricalColumn
+
     fc = self.categorical_column
     if isinstance(fc, HashedCategoricalColumn) or isinstance(fc, CrossedColumn):
       return fc.hash_bucket_size
@@ -2553,9 +2554,11 @@ def cardinality(self):
     if isinstance(fc, VocabularyFileCategoricalColumn):
       return len(fc.vocabulary_size) + fc.num_oov_buckets
 
-    if isinstance(fc, WeightedCategoricalColumn) or isinstance(fc, SequenceWeightedCategoricalColumn):
+    if isinstance(fc, WeightedCategoricalColumn) or isinstance(
+        fc, SequenceWeightedCategoricalColumn):
       sub_fc = fc.categorical_column
-      if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(sub_fc, CrossedColumn):
+      if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(
+          sub_fc, CrossedColumn):
         return sub_fc.hash_bucket_size
       if isinstance(sub_fc, IdentityCategoricalColumn):
         return sub_fc.num_buckets
diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py
index 13a175041..a17ce8fdc 100644
--- a/easy_rec/python/compat/feature_column/feature_column_v2.py
+++ b/easy_rec/python/compat/feature_column/feature_column_v2.py
@@ -1328,6 +1328,83 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
+def constant_numeric_column(key,
+                   shape=(1,),
+                   default_value=None,
+                   dtype=dtypes.float32,
+                   normalizer_fn=None,
+                   feature_name=None):
+  """Represents real valued or numerical features.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  columns = [price, ...]
+  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+
+  # or
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  columns = [bucketized_price, ...]
+  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    shape: An iterable of integers specifies the shape of the `Tensor`. An
+      integer can be given which means a single dimension `Tensor` with given
+      width. The `Tensor` representing the column will have the shape of
+      [batch_size] + `shape`.
+    default_value: A single value compatible with `dtype` or an iterable of
+      values compatible with `dtype` which the column takes on during
+      `tf.Example` parsing if data is missing. A default value of `None` will
+      cause `tf.io.parse_example` to fail if an example does not contain this
+      column. If a single value is provided, the same value will be applied as
+      the default value for every item. If an iterable of values is provided,
+      the shape of the `default_value` should be equal to the given `shape`.
+    dtype: defines the type of values. Default value is `tf.float32`. Must be a
+      non-quantized, real integer or floating point type.
+    normalizer_fn: If not `None`, a function that can be used to normalize the
+      value of the tensor after `default_value` is applied for parsing.
+      Normalizer function takes the input `Tensor` as its argument, and returns
+      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
+      even though the most common use case of this function is normalization, it
+      can be used for any kind of Tensorflow transformations.
+
+  Returns:
+    A `NumericColumn`.
+
+  Raises:
+    TypeError: if any dimension in shape is not an int
+    ValueError: if any dimension in shape is not a positive integer
+    TypeError: if `default_value` is an iterable but not compatible with `shape`
+    TypeError: if `default_value` is not compatible with `dtype`.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  shape = _check_shape(shape, key)
+  if not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype must be convertible to float. '
+                     'dtype: {}, key: {}'.format(dtype, key))
+  default_value = fc_utils.check_default_value(shape, default_value, dtype, key)
+
+  if normalizer_fn is not None and not callable(normalizer_fn):
+    raise TypeError(
+        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
+
+  fc_utils.assert_key_is_string(key)
+  return ConstantNumericColumn(
+      feature_name=feature_name,
+      key=key,
+      shape=shape,
+      default_value=default_value,
+      dtype=dtype,
+      normalizer_fn=normalizer_fn)
+
+
 def bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
@@ -2619,6 +2696,130 @@ def _normalize_feature_columns(feature_columns):
   return sorted(feature_columns, key=lambda x: x.name)
 
 
+class ConstantNumericColumn(
+    DenseColumn,
+    fc_old._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple('ConstantNumericColumn',
+                           ('feature_name', 'key', 'shape', 'default_value',
+                            'dtype', 'normalizer_fn'))):
+  """see `numeric_column`."""
+
+  @property
+  def _is_v2_column(self):
+    return True
+
+  @property
+  def name(self):
+    """See `FeatureColumn` base class."""
+    return self.feature_name if self.feature_name else self.key
+
+  @property
+  def raw_name(self):
+    """See `FeatureColumn` base class."""
+    return self.key
+
+  @property
+  def parse_example_spec(self):
+    """See `FeatureColumn` base class."""
+    return {
+        self.key:
+            parsing_ops.FixedLenFeature(self.shape, self.dtype,
+                                        self.default_value)
+    }
+
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.parse_example_spec
+
+  def _transform_input_tensor(self, input_tensor):
+    def_val = 0 if self.default_value is None else self.default_value
+    return tf.constant(def_val, dtypes.float32, self.shape)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    input_tensor = inputs.get(self.key)
+    return self._transform_input_tensor(input_tensor)
+
+  def transform_feature(self, transformation_cache, state_manager):
+    """See `FeatureColumn` base class.
+
+    In this case, we apply the `normalizer_fn` to the input tensor.
+
+    Args:
+      transformation_cache: A `FeatureTransformationCache` object to access
+        features.
+      state_manager: A `StateManager` to create / access resources such as
+        lookup tables.
+
+    Returns:
+      Normalized input tensor.
+
+    Raises:
+      ValueError: If a SparseTensor is passed in.
+    """
+    input_tensor = transformation_cache.get(self.key, state_manager)
+    return self._transform_input_tensor(input_tensor)
+
+  @property
+  def variable_shape(self):
+    """See `DenseColumn` base class."""
+    return tensor_shape.TensorShape(self.shape)
+
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _variable_shape(self):
+    return self.variable_shape
+
+  def get_dense_tensor(self, transformation_cache, state_manager):
+    """Returns dense `Tensor` representing numeric feature.
+
+    Args:
+      transformation_cache: A `FeatureTransformationCache` object to access
+        features.
+      state_manager: A `StateManager` to create / access resources such as
+        lookup tables.
+
+    Returns:
+      Dense `Tensor` created within `transform_feature`.
+    """
+    # Feature has been already transformed. Return the intermediate
+    # representation created by _transform_feature.
+    return transformation_cache.get(self, state_manager)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    return inputs.get(self)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['normalizer_fn'] = utils.serialize_keras_object(self.normalizer_fn)
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['normalizer_fn'] = utils.deserialize_keras_object(
+        config['normalizer_fn'], custom_objects=custom_objects)
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
+
 class NumericColumn(
     DenseColumn,
     fc_old._DenseColumn,  # pylint: disable=protected-access
@@ -3395,9 +3596,11 @@ def cardinality(self):
     if isinstance(fc, VocabularyFileCategoricalColumn):
       return len(fc.vocabulary_size) + fc.num_oov_buckets
 
-    if isinstance(fc, WeightedCategoricalColumn) or isinstance(fc, SequenceWeightedCategoricalColumn):
+    if isinstance(fc, WeightedCategoricalColumn) or isinstance(
+        fc, SequenceWeightedCategoricalColumn):
       sub_fc = fc.categorical_column
-      if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(sub_fc, CrossedColumn):
+      if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(
+          sub_fc, CrossedColumn):
         return sub_fc.hash_bucket_size
       if isinstance(sub_fc, IdentityCategoricalColumn):
         return sub_fc.num_buckets
@@ -3777,9 +3980,11 @@ def cardinality(self):
     if isinstance(fc, VocabularyFileCategoricalColumn):
       return len(fc.vocabulary_size) + fc.num_oov_buckets
 
-    if isinstance(fc, WeightedCategoricalColumn) or isinstance(fc, SequenceWeightedCategoricalColumn):
+    if isinstance(fc, WeightedCategoricalColumn) or isinstance(
+        fc, SequenceWeightedCategoricalColumn):
       sub_fc = fc.categorical_column
-      if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(sub_fc, CrossedColumn):
+      if isinstance(sub_fc, HashedCategoricalColumn) or isinstance(
+          sub_fc, CrossedColumn):
         return sub_fc.hash_bucket_size
       if isinstance(sub_fc, IdentityCategoricalColumn):
         return sub_fc.num_buckets
diff --git a/easy_rec/python/feature_column/feature_column.py b/easy_rec/python/feature_column/feature_column.py
index cc7cfbe77..8f4a88913 100644
--- a/easy_rec/python/feature_column/feature_column.py
+++ b/easy_rec/python/feature_column/feature_column.py
@@ -129,6 +129,8 @@ def _cmp_embed_config(a, b):
           self.parse_sequence_feature(config)
         elif config.feature_type == config.ExprFeature:
           self.parse_expr_feature(config)
+        elif config.feature_type == config.ConstFeature:
+          self.parse_const_feature(config)
         else:
           assert False, 'invalid feature type: %s' % config.feature_type
       except FeatureKeyError:
@@ -331,8 +333,9 @@ def parse_tag_feature(self, config):
           default_value=0,
           feature_name=feature_name)
 
-    tag_fc = feature_column.weighted_categorical_column(
-        tag_fc, weight_feature_key=feature_name + '_w', dtype=tf.float32)
+    if len(config.input_names) > 1 or config.HasField('kv_separator'):
+      tag_fc = feature_column.weighted_categorical_column(
+          tag_fc, weight_feature_key=feature_name + '_w', dtype=tf.float32)
 
     if self.is_wide(config):
       self._add_wide_embedding_column(tag_fc, config)
@@ -396,9 +399,7 @@ def parse_raw_feature(self, config):
           self._deep_columns[feature_name] = fc
 
   def parse_expr_feature(self, config):
-    """Generate raw features columns.
-
-    if boundaries is set, will be converted to category_column first.
+    """Generate expression features columns.
 
     Args:
       config: instance of easy_rec.python.protos.feature_config_pb2.FeatureConfig
@@ -408,7 +409,24 @@ def parse_expr_feature(self, config):
     fc = feature_column.numeric_column(
         feature_name, shape=(1,), feature_name=feature_name)
     if self.is_wide(config):
-      self._add_wide_embedding_column(fc, config)
+      self._wide_columns[feature_name] = fc
+    if self.is_deep(config):
+      self._deep_columns[feature_name] = fc
+
+  def parse_const_feature(self, config):
+    """Generate constant features columns.
+
+    used for mask input features.
+
+    Args:
+      config: instance of easy_rec.python.protos.feature_config_pb2.FeatureConfig
+    """
+    feature_name = config.feature_name if config.HasField('feature_name') \
+        else config.input_names[0]
+    fc = feature_column.constant_numeric_column(
+        feature_name, shape=(config.embedding_dim,), feature_name=feature_name)
+    if self.is_wide(config):
+      self._wide_columns[feature_name] = fc
     if self.is_deep(config):
       self._deep_columns[feature_name] = fc
 
diff --git a/easy_rec/python/input/augment.py b/easy_rec/python/input/augment.py
index 47822c366..c9802c88c 100644
--- a/easy_rec/python/input/augment.py
+++ b/easy_rec/python/input/augment.py
@@ -1,6 +1,7 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import tensorflow as tf
+
 from easy_rec.python.utils.shape_utils import get_shape_list
 
 if tf.__version__ >= '2.0':
@@ -8,69 +9,83 @@
 
 
 def assign(input_tensor, position=None, value=None):
-    input_tensor[tuple(position)] = value
-    return input_tensor
+  input_tensor[tuple(position)] = value
+  return input_tensor
 
 
 def item_mask(aug_data, length, gamma=0.3):
-    length1 = tf.cast(length, dtype=tf.float32)
-    num_mask = tf.cast(tf.math.floor(length1 * gamma), dtype=tf.int32)
-    seq = tf.range(length, dtype=tf.int32)
-    mask_index = tf.random.shuffle(seq)[:num_mask]
-    masked_item_seq = aug_data
-    masked_item_seq = tf.py_func(assign, inp=[masked_item_seq, [mask_index], 0], Tout=masked_item_seq.dtype)
-    return masked_item_seq, length
+  length1 = tf.cast(length, dtype=tf.float32)
+  num_mask = tf.cast(tf.math.floor(length1 * gamma), dtype=tf.int32)
+  seq = tf.range(length, dtype=tf.int32)
+  mask_index = tf.random.shuffle(seq)[:num_mask]
+  masked_item_seq = aug_data
+  masked_item_seq = tf.py_func(
+      assign,
+      inp=[masked_item_seq, [mask_index], 0],
+      Tout=masked_item_seq.dtype)
+  return masked_item_seq, length
 
 
 def item_crop(aug_data, length, eta=0.6):
-    length1 = tf.cast(length, dtype=tf.float32)
-    max_length = tf.cast(get_shape_list(aug_data)[0], dtype=tf.int32)
-    embedding_size = get_shape_list(aug_data)[1]
+  length1 = tf.cast(length, dtype=tf.float32)
+  max_length = tf.cast(get_shape_list(aug_data)[0], dtype=tf.int32)
+  embedding_size = get_shape_list(aug_data)[1]
 
-    num_left = tf.cast(tf.math.floor(length1 * eta), dtype=tf.int32)
-    crop_begin = tf.random.uniform([1], minval=0, maxval=length - num_left, dtype=tf.int32)[0]
-    cropped_item_seq = tf.zeros([get_shape_list(aug_data)[0], embedding_size])
-    cropped_item_seq = tf.where(crop_begin + num_left < max_length,
-                                tf.concat([aug_data[crop_begin:crop_begin + num_left],
-                                           cropped_item_seq[:max_length - num_left]], axis=0),
-                                tf.concat([aug_data[crop_begin:], cropped_item_seq[:crop_begin]], axis=0))
-    return cropped_item_seq, num_left
+  num_left = tf.cast(tf.math.floor(length1 * eta), dtype=tf.int32)
+  crop_begin = tf.random.uniform([1],
+                                 minval=0,
+                                 maxval=length - num_left,
+                                 dtype=tf.int32)[0]
+  cropped_item_seq = tf.zeros([get_shape_list(aug_data)[0], embedding_size])
+  cropped_item_seq = tf.where(
+      crop_begin + num_left < max_length,
+      tf.concat([
+          aug_data[crop_begin:crop_begin + num_left],
+          cropped_item_seq[:max_length - num_left]
+      ],
+                axis=0),
+      tf.concat([aug_data[crop_begin:], cropped_item_seq[:crop_begin]], axis=0))
+  return cropped_item_seq, num_left
 
 
 def item_reorder(aug_data, length, beta=0.6):
-    length1 = tf.cast(length,dtype=tf.float32)
-    num_reorder = tf.cast(tf.math.floor(length1 * beta) ,dtype=tf.int32)
-    reorder_begin = tf.random.uniform([1], minval=0, maxval=length - num_reorder, dtype=tf.int32)[0]
-    shuffle_index = tf.range(reorder_begin, reorder_begin + num_reorder)
-    shuffle_index = tf.random.shuffle(shuffle_index)
-    x = tf.range(get_shape_list(aug_data)[0])
-    left = tf.slice(x, [0], [reorder_begin])
-    right = tf.slice(x, [reorder_begin + num_reorder], [-1])
-    reordered_item_index = tf.concat([left, shuffle_index, right], axis=0)
-    reordered_item_seq = tf.scatter_nd(tf.expand_dims(reordered_item_index, axis=1),
-                                       aug_data,
-                                       tf.shape(aug_data))
-    return reordered_item_seq, length
+  length1 = tf.cast(length, dtype=tf.float32)
+  num_reorder = tf.cast(tf.math.floor(length1 * beta), dtype=tf.int32)
+  reorder_begin = tf.random.uniform([1],
+                                    minval=0,
+                                    maxval=length - num_reorder,
+                                    dtype=tf.int32)[0]
+  shuffle_index = tf.range(reorder_begin, reorder_begin + num_reorder)
+  shuffle_index = tf.random.shuffle(shuffle_index)
+  x = tf.range(get_shape_list(aug_data)[0])
+  left = tf.slice(x, [0], [reorder_begin])
+  right = tf.slice(x, [reorder_begin + num_reorder], [-1])
+  reordered_item_index = tf.concat([left, shuffle_index, right], axis=0)
+  reordered_item_seq = tf.scatter_nd(
+      tf.expand_dims(reordered_item_index, axis=1), aug_data,
+      tf.shape(aug_data))
+  return reordered_item_seq, length
 
 
 def augment(x):
-    seq, length = x
-    flag = tf.range(3, dtype=tf.int32)
-    flag1 = tf.random.shuffle(flag)[:1][0]
-    aug_seq, aug_len = tf.cond(tf.equal(flag1, 0),
-                               lambda: item_crop(seq, length),
-                               lambda: tf.cond(tf.equal(flag1, 1),
-                                               lambda: item_mask(seq, length),
-                                               lambda: item_reorder(seq, length)))
+  seq, length = x
+  flag = tf.range(3, dtype=tf.int32)
+  flag1 = tf.random.shuffle(flag)[:1][0]
+  aug_seq, aug_len = tf.cond(
+      tf.equal(flag1, 0), lambda: item_crop(seq, length), lambda: tf.cond(
+          tf.equal(flag1, 1), lambda: item_mask(seq, length), lambda:
+          item_reorder(seq, length)))
 
-    return [aug_seq, aug_len]
+  return [aug_seq, aug_len]
 
 
 def input_aug_data(original_data, seq_len):
-    print("seq_len:", seq_len)
-    lengths = tf.cast(seq_len, dtype=tf.int32)
-    aug_seq1, aug_len1 = tf.map_fn(augment, elems=(original_data, lengths), dtype=[tf.float32, tf.int32])
-    aug_seq2, aug_len2 = tf.map_fn(augment, elems=(original_data, lengths), dtype=[tf.float32, tf.int32])
-    aug_seq1 = tf.reshape(aug_seq1, tf.shape(original_data))
-    aug_seq2 = tf.reshape(aug_seq2, tf.shape(original_data))
-    return aug_seq1, aug_seq2, aug_len1, aug_len2
+  print('seq_len:', seq_len)
+  lengths = tf.cast(seq_len, dtype=tf.int32)
+  aug_seq1, aug_len1 = tf.map_fn(
+      augment, elems=(original_data, lengths), dtype=[tf.float32, tf.int32])
+  aug_seq2, aug_len2 = tf.map_fn(
+      augment, elems=(original_data, lengths), dtype=[tf.float32, tf.int32])
+  aug_seq1 = tf.reshape(aug_seq1, tf.shape(original_data))
+  aug_seq2 = tf.reshape(aug_seq2, tf.shape(original_data))
+  return aug_seq1, aug_seq2, aug_len1, aug_len2
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index 52581b4e2..d4a990c35 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -225,6 +225,19 @@ def should_stop(self, curr_epoch):
       total_epoch = 1
     return total_epoch is not None and curr_epoch >= total_epoch
 
+  def get_erase_features(self):
+    if self._pipeline_config is None:
+      return set()
+
+    config = self._pipeline_config.model_config.variational_dropout
+    if config is None:
+      return set()
+
+    top_k = config.fine_tune_use_top_k_features
+    from easy_rec.python.layers.fscd_layer import get_top_and_bottom_features
+    _, erase_features = get_top_and_bottom_features(self._pipeline_config, top_k)
+    return erase_features
+
   def create_multi_placeholders(self, export_config):
     """Create multiply placeholders on export, one for each feature.
 
@@ -252,6 +265,7 @@ def create_multi_placeholders(self, export_config):
           self._input_fields[fid] != sample_weight_field
       ]
 
+    erase_features = self.get_erase_features()
     inputs = {}
     for fid in effective_fids:
       input_name = self._input_fields[fid]
@@ -265,12 +279,20 @@ def create_multi_placeholders(self, export_config):
         tf_type = self._multi_value_types[input_name]
         logging.info('multi value input_name: %s, dtype: %s' %
                      (input_name, tf_type))
-        finput = tf.placeholder(tf_type, [None, None], name=placeholder_name)
+        if input_name in erase_features:
+          def_val = self.get_type_defaults(tf_type, self._input_field_defaults[fid])
+          finput = tf.placeholder_with_default(def_val, [None, None], name=placeholder_name)
+        else:
+          finput = tf.placeholder(tf_type, [None, None], name=placeholder_name)
       else:
         ftype = self._input_field_types[fid]
         tf_type = get_tf_type(ftype)
         logging.info('input_name: %s, dtype: %s' % (input_name, tf_type))
-        finput = tf.placeholder(tf_type, [None], name=placeholder_name)
+        if input_name in erase_features:
+          def_val = self.get_type_defaults(tf_type, self._input_field_defaults[fid])
+          finput = tf.placeholder_with_default(def_val, [None], name=placeholder_name)
+        else:
+          finput = tf.placeholder(tf_type, [None], name=placeholder_name)
       inputs[input_name] = finput
     features = {x: inputs[x] for x in inputs}
     features = self._preprocess(features)
@@ -302,11 +324,15 @@ def create_placeholders(self, export_config):
           len(effective_fids))
     input_vals = tf.reshape(
         input_vals, [-1, len(effective_fids)], name='input_reshape')
+
+    erase_features = self.get_erase_features()
     features = {}
     for tmp_id, fid in enumerate(effective_fids):
       ftype = self._input_field_types[fid]
       tf_type = get_tf_type(ftype)
       input_name = self._input_fields[fid]
+      if input_name in erase_features:
+        continue
       if tf_type in [tf.float32, tf.double, tf.int32, tf.int64]:
         features[input_name] = tf.string_to_number(
             input_vals[:, tmp_id],
@@ -472,6 +498,11 @@ def _parse_id_feature(self, fc, parsed_dict, field_dict):
               tf.int32,
               name='%s_str_2_int' % input_0)
 
+  def _parse_const_feature(self, fc, parsed_dict, field_dict):
+    input_0 = fc.input_names[0]
+    feature_name = fc.feature_name if fc.HasField('feature_name') else input_0
+    parsed_dict[feature_name] = field_dict[input_0]
+
   def _parse_raw_feature(self, fc, parsed_dict, field_dict):
     input_0 = fc.input_names[0]
     feature_name = fc.feature_name if fc.HasField('feature_name') else input_0
@@ -779,6 +810,8 @@ def _preprocess(self, field_dict):
         self._parse_id_feature(fc, parsed_dict, field_dict)
       elif feature_type == fc.ExprFeature:
         self._parse_expr_feature(fc, parsed_dict, field_dict)
+      elif feature_type == fc.ConstFeature:
+        self._parse_const_feature(fc, parsed_dict, field_dict)
       else:
         feature_name = fc.feature_name if fc.HasField(
             'feature_name') else fc.input_names[0]
diff --git a/easy_rec/python/layers/bst.py b/easy_rec/python/layers/bst.py
index c9cf7d8c9..9f2f78030 100644
--- a/easy_rec/python/layers/bst.py
+++ b/easy_rec/python/layers/bst.py
@@ -2,11 +2,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import tensorflow as tf
 
+from easy_rec.python.input.augment import input_aug_data
 from easy_rec.python.layers import multihead_cross_attention
+from easy_rec.python.loss.nce_loss import nce_loss
 from easy_rec.python.utils.activation import get_activation
 from easy_rec.python.utils.shape_utils import get_shape_list
-from easy_rec.python.loss.nce_loss import nce_loss
-from easy_rec.python.input.augment import input_aug_data
+
 # from tensorflow.python.keras.layers import Layer
 
 
@@ -20,31 +21,31 @@ def __init__(self, config, l2_reg, name='bst', **kwargs):
 
   def encode(self, seq_input, max_position):
     seq_fea = multihead_cross_attention.embedding_postprocessor(
-      seq_input,
-      position_embedding_name=self.name + '/position_embeddings',
-      max_position_embeddings=max_position,
-      reuse_position_embedding=tf.AUTO_REUSE)
+        seq_input,
+        position_embedding_name=self.name + '/position_embeddings',
+        max_position_embeddings=max_position,
+        reuse_position_embedding=tf.AUTO_REUSE)
 
     n = tf.count_nonzero(seq_input, axis=-1)
     seq_mask = tf.cast(n > 0, tf.int32)
 
     attention_mask = multihead_cross_attention.create_attention_mask_from_input_mask(
-      from_tensor=seq_fea, to_mask=seq_mask)
+        from_tensor=seq_fea, to_mask=seq_mask)
 
     hidden_act = get_activation(self.config.hidden_act)
     attention_fea = multihead_cross_attention.transformer_encoder(
-      seq_fea,
-      hidden_size=self.config.hidden_size,
-      num_hidden_layers=self.config.num_hidden_layers,
-      num_attention_heads=self.config.num_attention_heads,
-      attention_mask=attention_mask,
-      intermediate_size=self.config.intermediate_size,
-      intermediate_act_fn=hidden_act,
-      hidden_dropout_prob=self.config.hidden_dropout_prob,
-      attention_probs_dropout_prob=self.config.attention_probs_dropout_prob,
-      initializer_range=self.config.initializer_range,
-      name=self.name + '/bst',
-      reuse=tf.AUTO_REUSE)
+        seq_fea,
+        hidden_size=self.config.hidden_size,
+        num_hidden_layers=self.config.num_hidden_layers,
+        num_attention_heads=self.config.num_attention_heads,
+        attention_mask=attention_mask,
+        intermediate_size=self.config.intermediate_size,
+        intermediate_act_fn=hidden_act,
+        hidden_dropout_prob=self.config.hidden_dropout_prob,
+        attention_probs_dropout_prob=self.config.attention_probs_dropout_prob,
+        initializer_range=self.config.initializer_range,
+        name=self.name + '/bst',
+        reuse=tf.AUTO_REUSE)
     # attention_fea shape: [batch_size, seq_length, hidden_size]
     out_fea = attention_fea[:, 0, :]  # target feature
     print('bst output shape:', out_fea.shape)
@@ -84,11 +85,11 @@ def __call__(self, inputs, training=None, **kwargs):
     seq_len = seq_features[0][1]
 
     if self.config.need_contrastive_learning:
-      assert 'loss_dict' in kwargs, "no `loss_dict` in kwargs of bst layer: %s" % self.name
+      assert 'loss_dict' in kwargs, 'no `loss_dict` in kwargs of bst layer: %s' % self.name
       loss = self.contrastive_loss(seq_input, seq_len, max_position)
       if self.config.auto_contrastive_loss_weight:
         uncertainty = tf.Variable(
-          0, name='%s_contrastive_loss_weight' % self.name, dtype=tf.float32)
+            0, name='%s_contrastive_loss_weight' % self.name, dtype=tf.float32)
         loss = tf.exp(-uncertainty) * loss + 0.5 * uncertainty
       else:
         loss *= self.config.contrastive_loss_weight
@@ -102,10 +103,10 @@ def __call__(self, inputs, training=None, **kwargs):
                                             ' in feature group:' + self.name
       if target_size != self.config.hidden_size:
         target_feature = tf.layers.dense(
-          target_feature,
-          self.config.hidden_size,
-          activation=tf.nn.relu,
-          kernel_regularizer=self.l2_reg)
+            target_feature,
+            self.config.hidden_size,
+            activation=tf.nn.relu,
+            kernel_regularizer=self.l2_reg)
       # target_feature: [batch_size, 1, embed_size]
       target_feature = tf.expand_dims(target_feature, 1)
       # seq_input: [batch_size, seq_len+1, embed_size]
@@ -119,4 +120,3 @@ def contrastive_loss(self, seq_input, seq_len, max_position):
     seq_output2 = self.encode(aug_seq2, max_position)
     loss = nce_loss(seq_output1, seq_output2)
     return loss
-
diff --git a/easy_rec/python/layers/din.py b/easy_rec/python/layers/din.py
index 71c6e1ab4..18505bd44 100644
--- a/easy_rec/python/layers/din.py
+++ b/easy_rec/python/layers/din.py
@@ -61,7 +61,8 @@ def __call__(self, inputs, training=None, **kwargs):
       scores = scores / (seq_emb_size**0.5)
       scores = tf.nn.sigmoid(scores)
     else:
-      raise ValueError("unsupported attention normalizer: " + self.config.attention_normalizer)
+      raise ValueError('unsupported attention normalizer: ' +
+                       self.config.attention_normalizer)
 
     if target_emb_size < seq_emb_size:
       keys = keys[:, :, :target_emb_size]  # [B, L, E]
diff --git a/easy_rec/python/layers/fscd_layer.py b/easy_rec/python/layers/fscd_layer.py
index c8f94bc81..78849f162 100644
--- a/easy_rec/python/layers/fscd_layer.py
+++ b/easy_rec/python/layers/fscd_layer.py
@@ -1,10 +1,11 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from collections import OrderedDict
 import math
+import json
 import numpy as np
+import six
 import tensorflow as tf
-
+from tensorflow.python.framework.meta_graph import read_meta_graph_file
 from easy_rec.python.compat.feature_column.feature_column import _SharedEmbeddingColumn  # NOQA
 from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn  # NOQA
 from easy_rec.python.compat.feature_column.feature_column_v2 import SharedEmbeddingColumn  # NOQA
@@ -20,29 +21,53 @@ def get_feature_complexity(feature_configs):
     if config.HasField('feature_name'):
       name = config.feature_name
     feature_complexity[name] = config.complexity
+  return feature_complexity
 
-    # complexity = self._config.feature_complexity_weight * config.complexity
-    #
-    # # dim = 1.0
-    # # if config.HasField('embedding_dim'):
-    # #   dim = float(config.embedding_dim)
-    # dim = self.features_dimension[name]
-    # complexity += self._config.feature_dimension_weight * dim
-    #
-    # cardinal = 1.0
-    # if config.HasField('hash_bucket_size'):
-    #   cardinal = float(config.hash_bucket_size)
-    # elif config.HasField('num_buckets'):
-    #   cardinal = float(config.num_buckets)
-    # elif len(config.boundaries) > 0:
-    #   cardinal = float(len(config.boundaries) + 1)
-    # complexity += self._config.feature_cardinality_weight * cardinal
-    #
-    # theta = 1.0 - sigmoid(complexity)
-    # alpha = math.log(1.0 - theta) - math.log(theta)
-    # feature_regularize[name] = alpha
 
-  return feature_complexity
+def sigmoid(x):
+  return 1. / (1. + math.exp(-x))
+
+
+def get_top_and_bottom_features(pipeline_config, top_k):
+  assert pipeline_config.model_config.HasField(
+    'variational_dropout'), 'variational_dropout must be in model_config'
+
+  checkpoint_path = tf.train.latest_checkpoint(pipeline_config.model_dir)
+  meta_graph_def = read_meta_graph_file(checkpoint_path + '.meta')
+
+  features_map = dict()
+  for col_def in meta_graph_def.collection_def[
+    'variational_dropout'].bytes_list.value:
+    features = json.loads(col_def)
+    features_map.update(features)
+
+  top_features = set()
+  tf.logging.info('Reading checkpoint from %s ...' % checkpoint_path)
+  reader = tf.train.NewCheckpointReader(checkpoint_path)
+  for feature_group in pipeline_config.model_config.feature_groups:
+    group_name = feature_group.group_name
+    delta_name = 'fscd_delta_%s' % group_name
+    if not reader.has_tensor(delta_name):
+      continue
+    assert group_name in features_map, "%s not in feature map" % group_name
+    feature_dims = features_map[group_name]
+    delta = reader.get_tensor(delta_name)
+    values, indices = tf.nn.top_k(delta, top_k)
+    with tf.Session() as sess:
+      idx = indices.eval(session=sess)
+    for i in idx:
+      feature = feature_dims[i][0]
+      top_features.add(feature)
+
+  bottom_features = set()
+  for group_name, features in six.iteritems(features_map):
+    for name, dim in features:
+      if name not in top_features:
+        bottom_features.add(name)
+
+  print("selected top %d features:" % top_k, ','.join(top_features))
+  print("removed bottom features:", ','.join(bottom_features))
+  return top_features, bottom_features
 
 
 class FSCDLayer(object):
@@ -64,41 +89,70 @@ def __init__(self,
     self.feature_complexity = get_feature_complexity(feature_configs)
 
   def compute_dropout_mask(self, n, temperature=0.1):
-    delta_name = 'delta' if self.name == 'all' else 'delta_%s' % self.name
+    delta_name = 'fscd_delta_%s' % self.name
     delta = tf.get_variable(
-        name=delta_name,
-        shape=[n],
-        dtype=tf.float32,
-        initializer=tf.constant_initializer(0.5))
+      name=delta_name,
+      shape=[n],
+      dtype=tf.float32,
+      initializer=tf.constant_initializer(0.))
+    delta = tf.nn.sigmoid(delta)
 
     EPSILON = np.finfo(float).eps
-    unif_noise = tf.random_uniform([n], dtype=tf.float32, seed=None, name='uniform_noise')
-
+    unif_noise = tf.random_uniform([n],
+                                   dtype=tf.float32,
+                                   seed=None,
+                                   name='uniform_noise')
     approx = (
         tf.log(delta + EPSILON) - tf.log(1. - delta + EPSILON) +
         tf.log(unif_noise + EPSILON) - tf.log(1. - unif_noise + EPSILON))
     return tf.sigmoid(approx / temperature)
 
   def compute_regular_params(self, cols_to_feature):
-    alphas = OrderedDict()
+    alphas = {}
     for fc, fea in cols_to_feature.items():
       dim = int(fea.shape[-1])
       complexity = self.feature_complexity[fc.raw_name]
       cardinal = 1
-      if isinstance(fc, EmbeddingColumn) or isinstance(fc, SharedEmbeddingColumn):
+      if isinstance(fc, EmbeddingColumn) or isinstance(
+          fc, _SharedEmbeddingColumn) or isinstance(fc, SharedEmbeddingColumn):
         cardinal = fc.cardinality
       c = self._config.feature_complexity_weight * complexity
       c += self._config.feature_cardinality_weight * cardinal
       c += self._config.feature_dimension_weight * dim
-      theta = 1.0 - sigmoid(complexity)
-      alpha = math.log(1.0 - theta) - math.log(theta)
+      sig_c = sigmoid(c)
+      theta = 1.0 - sig_c
+      alpha = math.log(sig_c) - math.log(theta)
       alphas[fc] = alpha
+      print(str(fc.raw_name), "complexity:", complexity, "cardinality:", cardinal,
+            "dimension:", dim, "c:", c, "theta:", theta, "alpha:", alpha)
     return alphas
 
+  # def mask_bottom_features(self, cols_to_feature, top_k):
+  #   feature_map = tf.get_collection('variational_dropout')
+  #   features = feature_map[self.name]
+  #
+  #   delta_name = 'fscd_delta_%s' % self.name
+  #   graph = tf.get_default_graph()
+  #   delta = graph.get_tensor_by_name(delta_name)
+  #   values, indices = tf.nn.top_k(delta, top_k)
+  #
+  #   output_tensors = []
+  #   feature_columns = cols_to_feature.keys()
+  #   for column in sorted(feature_columns, key=lambda x: x.name):
+  #     value = cols_to_feature[column]
+  #     output_tensors.append(value)
+  #   return tf.concat(output_tensors, 1)
+
   def __call__(self, cols_to_feature):
     """
     cols_to_feature: an ordered dict mapping feature_column to feature_values
     """
+    # if self._config.HasField('fine_tune_use_top_k_features'):
+    #   k = self._config.fine_tune_use_top_k_features
+    #   assert k > 0, 'config `fine_tune_use_top_k_features` must be large than 0'
+    #   return self.mask_bottom_features(cols_to_feature, k)
+
+    feature_dimension = []
     output_tensors = []
     alphas = []
     z = self.compute_dropout_mask(len(cols_to_feature))  # keep ratio
@@ -112,16 +166,14 @@ def __call__(self, cols_to_feature):
       cols_to_feature[column] = out
       output_tensors.append(out)
       alphas.append(alpha)
+      feature_dimension.append((column.raw_name, int(value.shape[-1])))
 
     output_features = tf.concat(output_tensors, 1)
+    tf.add_to_collection('variational_dropout', json.dumps({self.name: feature_dimension}))
 
     batch_size = tf.shape(output_features)[0]
-    t_alpha = tf.convert_to_tensor(alphas)  # [M]
-    loss = tf.reduce_sum(t_alpha * z) / batch_size
+    t_alpha = tf.convert_to_tensor(alphas, dtype=tf.float32)
+    loss = tf.reduce_sum(t_alpha * z) / tf.to_float(batch_size)
 
     tf.add_to_collection('variational_dropout_loss', loss)
     return output_features
-
-
-def sigmoid(x):
-  return 1. / (1. + math.exp(-x))
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index 8098057ad..7e28458d5 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -1,5 +1,6 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
 from collections import OrderedDict
 
 import tensorflow as tf
@@ -12,8 +13,8 @@
 from easy_rec.python.feature_column.feature_group import FeatureGroup
 from easy_rec.python.layers import sequence_feature_layer
 from easy_rec.python.layers import variational_dropout_layer
-from easy_rec.python.layers.fscd_layer import FSCDLayer
 from easy_rec.python.layers.common_layers import text_cnn
+from easy_rec.python.layers.fscd_layer import FSCDLayer
 from easy_rec.python.protos.feature_config_pb2 import WideOrDeep
 from easy_rec.python.utils import shape_utils
 
@@ -118,7 +119,7 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
         return concat_features, group_features
     else:  # return sequence feature in raw format instead of combine them
       if self._variational_dropout_config is not None:
-        raise ValueError(
+        logging.warn(
             'variational dropout is not supported in not combined mode now.')
 
       feature_group = self._feature_groups[group_name]
@@ -138,7 +139,9 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
         group_features = [cols_to_output_tensors[x] for x in group_columns]
 
         for col, val in cols_to_output_tensors.items():
-          if isinstance(col, EmbeddingColumn) or isinstance(col, _SharedEmbeddingColumn) or isinstance(col, SharedEmbeddingColumn):
+          if isinstance(col, EmbeddingColumn) or isinstance(
+              col, _SharedEmbeddingColumn) or isinstance(
+                  col, SharedEmbeddingColumn):
             embedding_reg_lst.append(val)
 
       builder = feature_column._LazyBuilder(features)
@@ -226,11 +229,14 @@ def single_call_input_layer(self,
 
     if self._variational_dropout_config is not None:
       if self._variational_dropout_config.regularize_by_feature_complexity:
-        fscd = FSCDLayer(self._feature_configs, self._variational_dropout_config,
-                         is_training=self._is_training, name=group_name)
+        fscd = FSCDLayer(
+            self._feature_configs,
+            self._variational_dropout_config,
+            is_training=self._is_training,
+            name=group_name)
         output_features = fscd(cols_to_output_tensors)
         concat_features = array_ops.concat(
-          [output_features] + seq_features, axis=-1)
+            [output_features] + seq_features, axis=-1)
         group_features = [cols_to_output_tensors[x] for x in group_columns] + \
                          [cols_to_output_tensors[x] for x in group_seq_columns]
       else:
@@ -255,7 +261,8 @@ def single_call_input_layer(self,
                        [cols_to_output_tensors[x] for x in group_seq_columns]
 
     for fc, val in cols_to_output_tensors.items():
-      if isinstance(fc, EmbeddingColumn) or isinstance(fc, _SharedEmbeddingColumn) or isinstance(fc, SharedEmbeddingColumn):
+      if isinstance(fc, EmbeddingColumn) or isinstance(
+          fc, _SharedEmbeddingColumn) or isinstance(fc, SharedEmbeddingColumn):
         embedding_reg_lst.append(val)
 
     if embedding_reg_lst:
diff --git a/easy_rec/python/layers/multihead_cross_attention.py b/easy_rec/python/layers/multihead_cross_attention.py
index 511b2711d..f230ac974 100644
--- a/easy_rec/python/layers/multihead_cross_attention.py
+++ b/easy_rec/python/layers/multihead_cross_attention.py
@@ -708,11 +708,12 @@ def embedding_postprocessor(input_tensor,
   if use_position_embeddings:
     assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
     with tf.control_dependencies([assert_op]):
-      with tf.variable_scope("position_embedding", reuse=reuse_position_embedding):
+      with tf.variable_scope(
+          'position_embedding', reuse=reuse_position_embedding):
         full_position_embeddings = tf.get_variable(
-          name=position_embedding_name,
-          shape=[max_position_embeddings, width],
-          initializer=create_initializer(initializer_range))
+            name=position_embedding_name,
+            shape=[max_position_embeddings, width],
+            initializer=create_initializer(initializer_range))
       # Since the position embedding table is a learned variable, we create it
       # using a (long) sequence length `max_position_embeddings`. The actual
       # sequence length might be shorter than this, for faster training of
diff --git a/easy_rec/python/loss/nce_loss.py b/easy_rec/python/loss/nce_loss.py
index 7613384ab..f2e406d20 100644
--- a/easy_rec/python/loss/nce_loss.py
+++ b/easy_rec/python/loss/nce_loss.py
@@ -2,33 +2,38 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import tensorflow as tf
+
 from easy_rec.python.utils.shape_utils import get_shape_list
 
 
 def mask_samples(batch_size):
-    part = tf.ones((batch_size, batch_size), bool)
-    diag_part = tf.linalg.diag_part(part)
-    diag_part = tf.fill(tf.shape(diag_part), False)
-    part = tf.linalg.set_diag(part, diag_part)
-    part_half = tf.concat([part, part], axis=1)
-    part_total = tf.concat([part_half, part_half], axis=0)
-    return part_total
+  part = tf.ones((batch_size, batch_size), bool)
+  diag_part = tf.linalg.diag_part(part)
+  diag_part = tf.fill(tf.shape(diag_part), False)
+  part = tf.linalg.set_diag(part, diag_part)
+  part_half = tf.concat([part, part], axis=1)
+  part_total = tf.concat([part_half, part_half], axis=0)
+  return part_total
 
 
 def nce_loss(z_i, z_j, temp=1):
-    batch_size = get_shape_list(z_i)[0]
-    N = 2 * batch_size
-    z = tf.concat((z_i, z_j), axis=0)
-    sim = tf.matmul(z, tf.transpose(z)) / temp
-    sim_i_j = tf.matrix_diag_part(tf.slice(sim, [batch_size, 0], [batch_size, batch_size]))
-    sim_j_i = tf.matrix_diag_part(tf.slice(sim, [0, batch_size], [batch_size, batch_size]))
-    positive_samples = tf.reshape(tf.concat((sim_i_j, sim_j_i), axis=0), (N, 1))
-    mask = mask_samples(batch_size)
-    negative_samples = tf.reshape(tf.boolean_mask(sim, mask), (N, -1))
-
-    labels = tf.zeros(N, dtype=tf.int32)
-    logits = tf.concat((positive_samples, negative_samples), axis=1)
-
-    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
-
-    return loss
+  batch_size = get_shape_list(z_i)[0]
+  N = 2 * batch_size
+  z = tf.concat((z_i, z_j), axis=0)
+  sim = tf.matmul(z, tf.transpose(z)) / temp
+  sim_i_j = tf.matrix_diag_part(
+      tf.slice(sim, [batch_size, 0], [batch_size, batch_size]))
+  sim_j_i = tf.matrix_diag_part(
+      tf.slice(sim, [0, batch_size], [batch_size, batch_size]))
+  positive_samples = tf.reshape(tf.concat((sim_i_j, sim_j_i), axis=0), (N, 1))
+  mask = mask_samples(batch_size)
+  negative_samples = tf.reshape(tf.boolean_mask(sim, mask), (N, -1))
+
+  labels = tf.zeros(N, dtype=tf.int32)
+  logits = tf.concat((positive_samples, negative_samples), axis=1)
+
+  loss = tf.reduce_mean(
+      tf.nn.sparse_softmax_cross_entropy_with_logits(
+          labels=labels, logits=logits))
+
+  return loss
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index 871306326..6483877b7 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -11,13 +11,13 @@
 from tensorflow.python.ops.variables import PartitionedVariable
 
 from easy_rec.python.compat import regularizers
+from easy_rec.python.layers import dnn
 from easy_rec.python.layers import input_layer
 from easy_rec.python.layers.sequence_encoder import SequenceEncoder
 from easy_rec.python.utils import constant
 from easy_rec.python.utils import estimator_utils
 from easy_rec.python.utils import restore_filter
 from easy_rec.python.utils.load_class import get_register_class_meta
-from easy_rec.python.layers import dnn
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -110,8 +110,11 @@ def get_sequence_encoding(self, group_name=None, is_training=True):
     if group_name is not None:
       if group_name in self._sequence_encoding_by_group_name:
         return self._sequence_encoding_by_group_name[group_name]
-      encoding = self._sequence_encoder(self._feature_dict, group_name,
-                                        is_training, loss_dict=self._loss_dict)
+      encoding = self._sequence_encoder(
+          self._feature_dict,
+          group_name,
+          is_training,
+          loss_dict=self._loss_dict)
       self._sequence_encoding_by_group_name[group_name] = encoding
       return encoding
 
@@ -123,8 +126,11 @@ def get_sequence_encoding(self, group_name=None, is_training=True):
       if group_name in self._sequence_encoding_by_group_name:
         encoding = self._sequence_encoding_by_group_name[group_name]
       else:
-        encoding = self._sequence_encoder(self._feature_dict, group_name,
-                                          is_training, loss_dict=self._loss_dict)
+        encoding = self._sequence_encoder(
+            self._feature_dict,
+            group_name,
+            is_training,
+            loss_dict=self._loss_dict)
         self._sequence_encoding_by_group_name[group_name] = encoding
       if encoding is not None:
         seq_encoding.append(encoding)
@@ -138,10 +144,10 @@ def get_sequence_encoding(self, group_name=None, is_training=True):
 
     if self._base_model_config.HasField('sequence_dnn'):
       sequence_dnn = dnn.DNN(
-        self._base_model_config.sequence_dnn,
-        self._l2_reg,
-        name='sequence_dnn',
-        is_training=self._is_training)
+          self._base_model_config.sequence_dnn,
+          self._l2_reg,
+          name='sequence_dnn',
+          is_training=self._is_training)
       encoding = sequence_dnn(encoding)
     return encoding
 
diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto
index b642fff23..17e501361 100644
--- a/easy_rec/python/protos/feature_config.proto
+++ b/easy_rec/python/protos/feature_config.proto
@@ -43,6 +43,7 @@ message FeatureConfig {
         LookupFeature = 4;
         SequenceFeature = 5;
         ExprFeature = 6;
+        ConstFeature = 7;
     }
 
     enum FieldType {
diff --git a/easy_rec/python/protos/variational_dropout.proto b/easy_rec/python/protos/variational_dropout.proto
index afe4d061c..c643b3d2e 100644
--- a/easy_rec/python/protos/variational_dropout.proto
+++ b/easy_rec/python/protos/variational_dropout.proto
@@ -2,15 +2,16 @@ syntax = "proto2";
 package protos;
 
 
-message  VariationalDropoutLayer{
+message VariationalDropoutLayer {
     // regularization coefficient lambda
     optional float regularization_lambda = 1 [default = 0.01];
     // variational_dropout dimension
     optional bool embedding_wise_variational_dropout = 2 [default = false];
+
     // whether to use FSCD model
     optional bool regularize_by_feature_complexity = 3 [default = false];
-
     optional float feature_complexity_weight = 4 [default = 1.0];
     optional float feature_dimension_weight = 5 [default = 1e-2];
     optional float feature_cardinality_weight = 6 [default = 1e-7];
+    optional uint32 fine_tune_use_top_k_features = 7;
 }
diff --git a/easy_rec/python/tools/explainer/deep_shap.py b/easy_rec/python/tools/explainer/deep_shap.py
index 4d0b72890..64508232f 100644
--- a/easy_rec/python/tools/explainer/deep_shap.py
+++ b/easy_rec/python/tools/explainer/deep_shap.py
@@ -1,17 +1,18 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import numpy as np
 import warnings
+
+import numpy as np
+import tensorflow as tf
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.ops import gradients_impl as tf_gradients_impl
 
-if not hasattr(tf_gradients_impl, "_IsBackpropagatable"):
+if not hasattr(tf_gradients_impl, '_IsBackpropagatable'):
   from tensorflow.python.ops import gradients_util as tf_gradients_impl
-import tensorflow as tf
 
 
 class DeepShap(object):
-  """ Meant to approximate SHAP values for deep learning models.
+  """Meant to approximate SHAP values for deep learning models.
 
   This is an enhanced version of the DeepLIFT algorithm (Deep SHAP) where, similar to Kernel SHAP, we
   approximate the conditional expectations of SHAP values using a selection of background samples.
@@ -22,8 +23,13 @@ class DeepShap(object):
   current model output (f(x) - E[f(x)]).
   """
 
-  def __init__(self, inputs, output, data, session=None, learning_phase_flags=None):
-    """ An explainer object for a deep model using a given background dataset.
+  def __init__(self,
+               inputs,
+               output,
+               data,
+               session=None,
+               learning_phase_flags=None):
+    """An explainer object for a deep model using a given background dataset.
 
     Note that the complexity of the method scales linearly with the number of background data
     samples. Passing the entire training dataset as `data` will give very accurate expected
@@ -56,12 +62,14 @@ def __init__(self, inputs, output, data, session=None, learning_phase_flags=None
         batch norm or dropout. If None is passed then we look for tensors in the graph that look like
         learning phase flags. Note that we assume all the flags should
         have a value of False during predictions (and hence explanations).
-
     """
     self.model_inputs = inputs
     self.model_output = output
-    assert type(self.model_output) != list, "The model output to be explained must be a single tensor!"
-    assert len(self.model_output.shape) < 3, "The model output must be a vector or a single value!"
+    assert type(
+        self.model_output
+    ) != list, 'The model output to be explained must be a single tensor!'
+    assert len(self.model_output.shape
+               ) < 3, 'The model output must be a vector or a single value!'
     self.multi_output = True
     if len(self.model_output.shape) == 1:
       self.multi_output = False
@@ -76,7 +84,8 @@ def __init__(self, inputs, output, data, session=None, learning_phase_flags=None
       data = [data]
     self.data = data
 
-    self._vinputs = {}  # used to track what op inputs depends on the model inputs
+    self._vinputs = {
+    }  # used to track what op inputs depends on the model inputs
     self.orig_grads = {}
 
     if session is None:
@@ -93,10 +102,13 @@ def __init__(self, inputs, output, data, session=None, learning_phase_flags=None
     if learning_phase_flags is None:
       self.learning_phase_ops = []
       for op in self.graph.get_operations():
-        if 'learning_phase' in op.name and op.type == "Const" and len(op.outputs[0].shape) == 0:
+        if 'learning_phase' in op.name and op.type == 'Const' and len(
+            op.outputs[0].shape) == 0:
           if op.outputs[0].dtype == tf.bool:
             self.learning_phase_ops.append(op)
-      self.learning_phase_flags = [op.outputs[0] for op in self.learning_phase_ops]
+      self.learning_phase_flags = [
+          op.outputs[0] for op in self.learning_phase_ops
+      ]
     else:
       self.learning_phase_ops = [t.op for t in learning_phase_flags]
 
@@ -107,8 +119,10 @@ def __init__(self, inputs, output, data, session=None, learning_phase_flags=None
     else:
       if self.data[0].shape[0] > 5000:
         warnings.warn(
-          "You have provided over 5k background samples! For better performance consider using smaller random sample.")
-      self.expected_value = self.run(self.model_output, self.model_inputs, self.data).mean(0)
+            'You have provided over 5k background samples! For better performance consider using smaller random sample.'
+        )
+      self.expected_value = self.run(self.model_output, self.model_inputs,
+                                     self.data).mean(0)
 
     self._init_between_tensors(self.model_output.op, self.model_inputs)
 
@@ -122,22 +136,24 @@ def __init__(self, inputs, output, data, session=None, learning_phase_flags=None
       if noutputs is not None:
         self.phi_symbolics = [None for i in range(noutputs)]
       else:
-        raise Exception("The model output tensor to be explained cannot have a static shape in dim 1 of None!")
+        raise Exception(
+            'The model output tensor to be explained cannot have a static shape in dim 1 of None!'
+        )
 
   def run(self, out, model_inputs, X):
-    """ Runs the model while also setting the learning phase flags to False.
-    """
+    """Runs the model while also setting the learning phase flags to False."""
     feed_dict = dict(zip(model_inputs, X))
     for t in self.learning_phase_flags:
       feed_dict[t] = False
     return self.session.run(out, feed_dict)
 
   def phi_symbolic(self, i):
-    """ Get the SHAP value computation graph for a given model output.
-        """
+    """Get the SHAP value computation graph for a given model output."""
     if self.phi_symbolics[i] is None:
+
       def anon():
-        out = self.model_output[:, i] if self.multi_output else self.model_output
+        out = self.model_output[:,
+                                i] if self.multi_output else self.model_output
         return tf.gradients(out, self.model_inputs)
 
       self.phi_symbolics[i] = self.execute_with_overridden_gradients(anon)
@@ -145,10 +161,10 @@ def anon():
     return self.phi_symbolics[i]
 
   def custom_grad(self, op, *grads):
-    """ Passes a gradient op creation request to the correct handler.
-    """
-    type_name = op.type[5:] if op.type.startswith("shap_") else op.type
-    out = op_handlers[type_name](self, op, *grads)  # we cut off the shap_ prefex before the lookup
+    """Passes a gradient op creation request to the correct handler."""
+    type_name = op.type[5:] if op.type.startswith('shap_') else op.type
+    out = op_handlers[type_name](
+        self, op, *grads)  # we cut off the shap_ prefex before the lookup
     return out
 
   def execute_with_overridden_gradients(self, f):
@@ -157,22 +173,22 @@ def execute_with_overridden_gradients(self, f):
     reg = tf_ops._gradient_registry._registry
     ops_not_in_registry = ['TensorListReserve']
     # NOTE: location_tag taken from tensorflow source for None type ops
-    location_tag = ("UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN")
+    location_tag = ('UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN')
     # TODO: unclear why some ops are not in the registry with TF 2.0 like TensorListReserve
     for non_reg_ops in ops_not_in_registry:
       reg[non_reg_ops] = {'type': None, 'location': location_tag}
     for n in op_handlers:
       if n in reg:
-        self.orig_grads[n] = reg[n]["type"]
-        reg["shap_" + n] = {
-          "type": self.custom_grad,
-          "location": reg[n]["location"]
+        self.orig_grads[n] = reg[n]['type']
+        reg['shap_' + n] = {
+            'type': self.custom_grad,
+            'location': reg[n]['location']
         }
-        reg[n]["type"] = self.custom_grad
+        reg[n]['type'] = self.custom_grad
 
     # In TensorFlow 1.10 they started pruning out nodes that they think can't be backpropped
     # unfortunately that includes the index of embedding layers so we disable that check here
-    if hasattr(tf_gradients_impl, "_IsBackpropagatable"):
+    if hasattr(tf_gradients_impl, '_IsBackpropagatable'):
       orig_IsBackpropagatable = tf_gradients_impl._IsBackpropagatable
       tf_gradients_impl._IsBackpropagatable = lambda tensor: True
 
@@ -181,20 +197,24 @@ def execute_with_overridden_gradients(self, f):
       out = f()
     finally:
       # reinstate the backpropagatable check
-      if hasattr(tf_gradients_impl, "_IsBackpropagatable"):
+      if hasattr(tf_gradients_impl, '_IsBackpropagatable'):
         tf_gradients_impl._IsBackpropagatable = orig_IsBackpropagatable
 
       # restore the original gradient definitions
       for n in op_handlers:
         if n in reg:
-          del reg["shap_" + n]
-          reg[n]["type"] = self.orig_grads[n]
+          del reg['shap_' + n]
+          reg[n]['type'] = self.orig_grads[n]
       for non_reg_ops in ops_not_in_registry:
         del reg[non_reg_ops]
     return out
 
-  def shap_values(self, X, ranked_outputs=None, output_rank_order="max", check_additivity=True):
-    """ Return approximate SHAP values for the model applied to the data given by X.
+  def shap_values(self,
+                  X,
+                  ranked_outputs=None,
+                  output_rank_order='max',
+                  check_additivity=True):
+    """Return approximate SHAP values for the model applied to the data given by X.
 
     Parameters
     ----------
@@ -228,29 +248,32 @@ def shap_values(self, X, ranked_outputs=None, output_rank_order="max", check_add
     # check if we have multiple inputs
     if not self.multi_input:
       if type(X) == list and len(X) != 1:
-        assert False, "Expected a single tensor as model input!"
+        assert False, 'Expected a single tensor as model input!'
       elif type(X) != list:
         X = [X]
     else:
-      assert type(X) == list, "Expected a list of model inputs!"
-    assert len(self.model_inputs) == len(X), "Number of model inputs (%d) does not match the number given (%d)!" % (
-      len(self.model_inputs), len(X))
+      assert type(X) == list, 'Expected a list of model inputs!'
+    assert len(self.model_inputs) == len(
+        X
+    ), 'Number of model inputs (%d) does not match the number given (%d)!' % (
+        len(self.model_inputs), len(X))
 
     # rank and determine the model outputs that we will explain
     if ranked_outputs is not None and self.multi_output:
       model_output_values = self.run(self.model_output, self.model_inputs, X)
 
-      if output_rank_order == "max":
+      if output_rank_order == 'max':
         model_output_ranks = np.argsort(-model_output_values)
-      elif output_rank_order == "min":
+      elif output_rank_order == 'min':
         model_output_ranks = np.argsort(model_output_values)
-      elif output_rank_order == "max_abs":
+      elif output_rank_order == 'max_abs':
         model_output_ranks = np.argsort(np.abs(model_output_values))
       else:
-        assert False, "output_rank_order must be max, min, or max_abs!"
+        assert False, 'output_rank_order must be max, min, or max_abs!'
       model_output_ranks = model_output_ranks[:, :ranked_outputs]
     else:
-      model_output_ranks = np.tile(np.arange(len(self.phi_symbolics)), (X[0].shape[0], 1))
+      model_output_ranks = np.tile(
+          np.arange(len(self.phi_symbolics)), (X[0].shape[0], 1))
 
     # compute the attributions
     output_phis = []
@@ -267,19 +290,27 @@ def shap_values(self, X, ranked_outputs=None, output_rank_order="max", check_add
           bg_data = self.data
 
         # tile the inputs to line up with the background data samples
-        tiled_X = [np.tile(X[l][j:j + 1], (bg_data[l].shape[0],) + tuple([1 for k in range(len(X[l].shape) - 1)])) for l
-                   in range(len(X))]
+        tiled_X = [
+            np.tile(X[l][j:j + 1], (bg_data[l].shape[0],) +
+                    tuple([1
+                           for k in range(len(X[l].shape) - 1)]))
+            for l in range(len(X))
+        ]
 
         # we use the first sample for the current sample and the rest for the references
-        joint_input = [np.concatenate([tiled_X[l], bg_data[l]], 0) for l in range(len(X))]
+        joint_input = [
+            np.concatenate([tiled_X[l], bg_data[l]], 0) for l in range(len(X))
+        ]
 
         # run attribution computation graph
         feature_ind = model_output_ranks[j, i]
-        sample_phis = self.run(self.phi_symbolic(feature_ind), self.model_inputs, joint_input)
+        sample_phis = self.run(
+            self.phi_symbolic(feature_ind), self.model_inputs, joint_input)
 
         # assign the attributions to the right part of the output arrays
         for l in range(len(X)):
-          phis[l][j] = (sample_phis[l][bg_data[l].shape[0]:] * (X[l][j] - bg_data[l])).mean(0)
+          phis[l][j] = (sample_phis[l][bg_data[l].shape[0]:] *
+                        (X[l][j] - bg_data[l])).mean(0)
 
       output_phis.append(phis[0] if not self.multi_input else phis)
 
@@ -288,17 +319,19 @@ def shap_values(self, X, ranked_outputs=None, output_rank_order="max", check_add
       model_output = self.run(self.model_output, self.model_inputs, X)
       for l in range(len(self.expected_value)):
         if not self.multi_input:
-          diffs = model_output[:, l] - self.expected_value[l] - output_phis[l].sum(
-            axis=tuple(range(1, output_phis[l].ndim)))
+          diffs = model_output[:,
+                               l] - self.expected_value[l] - output_phis[l].sum(
+                                   axis=tuple(range(1, output_phis[l].ndim)))
         else:
           diffs = model_output[:, l] - self.expected_value[l]
           for i in range(len(output_phis[l])):
-            diffs -= output_phis[l][i].sum(axis=tuple(range(1, output_phis[l][i].ndim)))
+            diffs -= output_phis[l][i].sum(
+                axis=tuple(range(1, output_phis[l][i].ndim)))
         assert np.abs(
           diffs).max() < 1e-2, "The SHAP explanations do not sum up to the model's output! This is either because of a " \
-                               "rounding error or because an operator in your computation graph was not fully supported. If " \
-                               "the sum difference of %f is significant compared the scale of your model outputs please post " \
-                               "as a github issue, with a reproducible example if possible so we can debug it." % np.abs(
+                               'rounding error or because an operator in your computation graph was not fully supported. If ' \
+                               'the sum difference of %f is significant compared the scale of your model outputs please post ' \
+                               'as a github issue, with a reproducible example if possible so we can debug it.' % np.abs(
           diffs).max()
 
     if not self.multi_output:
@@ -310,21 +343,19 @@ def shap_values(self, X, ranked_outputs=None, output_rank_order="max", check_add
 
   def _init_between_tensors(self, out_op, model_inputs):
     # find all the operations in the graph between our inputs and outputs
-    tensor_blacklist = tensors_blocked_by_false(self.learning_phase_ops)  # don't follow learning phase branches
-    dependence_breakers = [k for k in op_handlers if op_handlers[k] == break_dependence]
-    back_ops = backward_walk_ops(
-      [out_op], tensor_blacklist,
-      dependence_breakers
-    )
+    tensor_blacklist = tensors_blocked_by_false(
+        self.learning_phase_ops)  # don't follow learning phase branches
+    dependence_breakers = [
+        k for k in op_handlers if op_handlers[k] == break_dependence
+    ]
+    back_ops = backward_walk_ops([out_op], tensor_blacklist,
+                                 dependence_breakers)
     start_ops = []
     for minput in model_inputs:
       for op in minput.consumers():
         start_ops.append(op)
     self.between_ops = forward_walk_ops(
-      start_ops,
-      tensor_blacklist, dependence_breakers,
-      within_ops=back_ops
-    )
+        start_ops, tensor_blacklist, dependence_breakers, within_ops=back_ops)
 
     # note all the tensors that are on the path between the inputs and the output
     self.between_tensors = {}
@@ -340,8 +371,7 @@ def _init_between_tensors(self, out_op, model_inputs):
       self.used_types[op.type] = True
 
   def _variable_inputs(self, op):
-    """ Return which inputs of this operation are variable (i.e. depend on the model inputs).
-    """
+    """Return which inputs of this operation are variable (i.e. depend on the model inputs)."""
     if op not in self._vinputs:
       out = np.zeros(len(op.inputs), dtype=np.bool)
       for i, t in enumerate(op.inputs):
@@ -351,7 +381,7 @@ def _variable_inputs(self, op):
 
 
 def tensors_blocked_by_false(ops):
-  """ Follows a set of ops assuming their value is False and find blocked Switch paths.
+  """Follows a set of ops assuming their value is False and find blocked Switch paths.
 
   This is used to prune away parts of the model graph that are only used during the training
   phase (like dropout, batch norm, etc.).
@@ -359,8 +389,10 @@ def tensors_blocked_by_false(ops):
   blocked = []
 
   def recurse(op):
-    if op.type == "Switch":
-      blocked.append(op.outputs[1])  # the true path is blocked since we assume the ops we trace are False
+    if op.type == 'Switch':
+      blocked.append(
+          op.outputs[1]
+      )  # the true path is blocked since we assume the ops we trace are False
     else:
       for out in op.outputs:
         for c in out.consumers():
@@ -385,7 +417,8 @@ def backward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist):
   return found_ops
 
 
-def forward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist, within_ops):
+def forward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist,
+                     within_ops):
   found_ops = []
   op_stack = [op for op in start_ops]
   while len(op_stack) > 0:
@@ -400,6 +433,7 @@ def forward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist, within_ops)
 
 
 def linearity_1d_nonlinearity_2d(input_ind0, input_ind1, op_func):
+
   def handler(explainer, op, *grads):
     var = explainer._variable_inputs(op)
     if var[input_ind0] and not var[input_ind1]:
@@ -407,14 +441,17 @@ def handler(explainer, op, *grads):
     elif var[input_ind1] and not var[input_ind0]:
       return linearity_1d_handler(input_ind1, explainer, op, *grads)
     elif var[input_ind0] and var[input_ind1]:
-      return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *grads)
+      return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer,
+                                     op, *grads)
     else:
-      return [None for _ in op.inputs]  # no inputs vary, we must be hidden by a switch function
+      return [None for _ in op.inputs
+              ]  # no inputs vary, we must be hidden by a switch function
 
   return handler
 
 
 def nonlinearity_1d_nonlinearity_2d(input_ind0, input_ind1, op_func):
+
   def handler(explainer, op, *grads):
     var = explainer._variable_inputs(op)
     if var[input_ind0] and not var[input_ind1]:
@@ -422,14 +459,17 @@ def handler(explainer, op, *grads):
     elif var[input_ind1] and not var[input_ind0]:
       return nonlinearity_1d_handler(input_ind1, explainer, op, *grads)
     elif var[input_ind0] and var[input_ind1]:
-      return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *grads)
+      return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer,
+                                     op, *grads)
     else:
-      return [None for _ in op.inputs]  # no inputs vary, we must be hidden by a switch function
+      return [None for _ in op.inputs
+              ]  # no inputs vary, we must be hidden by a switch function
 
   return handler
 
 
 def nonlinearity_1d(input_ind):
+
   def handler(explainer, op, *grads):
     return nonlinearity_1d_handler(input_ind, explainer, op, *grads)
 
@@ -444,7 +484,8 @@ def nonlinearity_1d_handler(input_ind, explainer, op, *grads):
 
   for i in range(len(op_inputs)):
     if i != input_ind:
-      assert not explainer._variable_inputs(op)[i], str(i) + "th input to " + op.name + " cannot vary!"
+      assert not explainer._variable_inputs(
+          op)[i], str(i) + 'th input to ' + op.name + ' cannot vary!'
 
   xin0, rin0 = tf.split(op_inputs[input_ind], 2)
   xout, rout = tf.split(op.outputs[input_ind], 2)
@@ -454,18 +495,18 @@ def nonlinearity_1d_handler(input_ind, explainer, op, *grads):
   else:
     dup0 = [2] + [1 for i in delta_in0.shape[1:]]
   out = [None for _ in op_inputs]
-  if op.type.startswith("shap_"):
+  if op.type.startswith('shap_'):
     op.type = op.type[5:]
   orig_grad = explainer.orig_grads[op.type](op, grads[0])
   out[input_ind] = tf.where(
-    tf.tile(tf.abs(delta_in0), dup0) < 1e-6,
-    orig_grad[input_ind] if len(op_inputs) > 1 else orig_grad,
-    grads[0] * tf.tile((xout - rout) / delta_in0, dup0)
-  )
+      tf.tile(tf.abs(delta_in0), dup0) < 1e-6,
+      orig_grad[input_ind] if len(op_inputs) > 1 else orig_grad,
+      grads[0] * tf.tile((xout - rout) / delta_in0, dup0))
   return out
 
 
-def nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *grads):
+def nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op,
+                            *grads):
   assert input_ind0 == 0 and input_ind1 == 1, "TODO: Can't yet handle double inputs that are not first!"
   xout, rout = tf.split(op.outputs[0], 2)
   in0 = op.inputs[input_ind0]
@@ -484,33 +525,37 @@ def nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *gra
   out1 = grads[0] * tf.tile(out1 / delta_in1, dup0)
 
   # Avoid divide by zero nans
-  out0 = tf.where(tf.abs(tf.tile(delta_in0, dup0)) < 1e-7, tf.zeros_like(out0), out0)
-  out1 = tf.where(tf.abs(tf.tile(delta_in1, dup0)) < 1e-7, tf.zeros_like(out1), out1)
+  out0 = tf.where(
+      tf.abs(tf.tile(delta_in0, dup0)) < 1e-7, tf.zeros_like(out0), out0)
+  out1 = tf.where(
+      tf.abs(tf.tile(delta_in1, dup0)) < 1e-7, tf.zeros_like(out1), out1)
 
   # see if due to broadcasting our gradient shapes don't match our input shapes
   if (np.any(np.array(out1.shape) != np.array(in1.shape))):
-    broadcast_index = np.where(np.array(out1.shape) != np.array(in1.shape))[0][0]
+    broadcast_index = np.where(
+        np.array(out1.shape) != np.array(in1.shape))[0][0]
     out1 = tf.reduce_sum(out1, axis=broadcast_index, keepdims=True)
   elif (np.any(np.array(out0.shape) != np.array(in0.shape))):
-    broadcast_index = np.where(np.array(out0.shape) != np.array(in0.shape))[0][0]
+    broadcast_index = np.where(
+        np.array(out0.shape) != np.array(in0.shape))[0][0]
     out0 = tf.reduce_sum(out0, axis=broadcast_index, keepdims=True)
 
   return [out0, out1]
 
 
 def softmax(explainer, op, *grads):
-  """ Just decompose softmax into its components and recurse, we can handle all of them :)
+  """Just decompose softmax into its components and recurse, we can handle all of them :)
 
-    We assume the 'axis' is the last dimension because the TF codebase swaps the 'axis' to
-    the last dimension before the softmax op if 'axis' is not already the last dimension.
-    We also don't subtract the max before tf.exp for numerical stability since that might
-    mess up the attributions and it seems like TensorFlow doesn't define softmax that way
-    (according to the docs)
-    """
+  We assume the 'axis' is the last dimension because the TF codebase swaps the 'axis' to
+  the last dimension before the softmax op if 'axis' is not already the last dimension.
+  We also don't subtract the max before tf.exp for numerical stability since that might
+  mess up the attributions and it seems like TensorFlow doesn't define softmax that way
+  (according to the docs)
+  """
   in0 = op.inputs[0]
-  in0_max = tf.reduce_max(in0, axis=-1, keepdims=True, name="in0_max")
+  in0_max = tf.reduce_max(in0, axis=-1, keepdims=True, name='in0_max')
   in0_centered = in0 - in0_max
-  evals = tf.exp(in0_centered, name="custom_exp")
+  evals = tf.exp(in0_centered, name='custom_exp')
   rsum = tf.reduce_sum(evals, axis=-1, keepdims=True)
   div = evals / rsum
 
@@ -534,10 +579,8 @@ def softmax(explainer, op, *grads):
   delta_in0 = xin0 - rin0
   dup0 = [2] + [1 for i in delta_in0.shape[1:]]
   return tf.where(
-    tf.tile(tf.abs(delta_in0), dup0) < 1e-6,
-    out,
-    out * tf.tile((xin0_centered - rin0_centered) / delta_in0, dup0)
-  )
+      tf.tile(tf.abs(delta_in0), dup0) < 1e-6, out,
+      out * tf.tile((xin0_centered - rin0_centered) / delta_in0, dup0))
 
 
 def maxpool(explainer, op, *grads):
@@ -547,14 +590,14 @@ def maxpool(explainer, op, *grads):
   dup0 = [2] + [1 for i in delta_in0.shape[1:]]
   cross_max = tf.maximum(xout, rout)
   diffs = tf.concat([cross_max - rout, xout - cross_max], 0)
-  if op.type.startswith("shap_"):
+  if op.type.startswith('shap_'):
     op.type = op.type[5:]
-  xmax_pos, rmax_pos = tf.split(explainer.orig_grads[op.type](op, grads[0] * diffs), 2)
-  return tf.tile(tf.where(
-    tf.abs(delta_in0) < 1e-7,
-    tf.zeros_like(delta_in0),
-    (xmax_pos + rmax_pos) / delta_in0
-  ), dup0)
+  xmax_pos, rmax_pos = tf.split(
+      explainer.orig_grads[op.type](op, grads[0] * diffs), 2)
+  return tf.tile(
+      tf.where(
+          tf.abs(delta_in0) < 1e-7, tf.zeros_like(delta_in0),
+          (xmax_pos + rmax_pos) / delta_in0), dup0)
 
 
 def gather(explainer, op, *grads):
@@ -563,35 +606,41 @@ def gather(explainer, op, *grads):
   # axis = op.inputs[2]
   var = explainer._variable_inputs(op)
   if var[1] and not var[0]:
-    assert len(indices.shape) == 2, "Only scalar indices supported right now in GatherV2!"
+    assert len(indices.shape
+               ) == 2, 'Only scalar indices supported right now in GatherV2!'
 
     xin1, rin1 = tf.split(tf.cast(op.inputs[1], tf.float32), 2)
     xout, rout = tf.split(op.outputs[0], 2)
     dup_in1 = [2] + [1 for i in xin1.shape[1:]]
     dup_out = [2] + [1 for i in xout.shape[1:]]
     delta_in1_t = tf.tile(xin1 - rin1, dup_in1)
-    out_sum = tf.reduce_sum(grads[0] * tf.tile(xout - rout, dup_out),
-                            list(range(len(indices.shape), len(grads[0].shape))))
-    if op.type == "ResourceGather":
-      return [None, tf.where(
-        tf.abs(delta_in1_t) < 1e-6,
-        tf.zeros_like(delta_in1_t),
-        out_sum / delta_in1_t
-      )]
-    return [None, tf.where(
-      tf.abs(delta_in1_t) < 1e-6,
-      tf.zeros_like(delta_in1_t),
-      out_sum / delta_in1_t
-    ), None]
+    out_sum = tf.reduce_sum(
+        grads[0] * tf.tile(xout - rout, dup_out),
+        list(range(len(indices.shape), len(grads[0].shape))))
+    if op.type == 'ResourceGather':
+      return [
+          None,
+          tf.where(
+              tf.abs(delta_in1_t) < 1e-6, tf.zeros_like(delta_in1_t),
+              out_sum / delta_in1_t)
+      ]
+    return [
+        None,
+        tf.where(
+            tf.abs(delta_in1_t) < 1e-6, tf.zeros_like(delta_in1_t),
+            out_sum / delta_in1_t), None
+    ]
   elif var[0] and not var[1]:
-    if op.type.startswith("shap_"):
+    if op.type.startswith('shap_'):
       op.type = op.type[5:]
-    return [explainer.orig_grads[op.type](op, grads[0]), None]  # linear in this case
+    return [explainer.orig_grads[op.type](op, grads[0]),
+            None]  # linear in this case
   else:
-    assert False, "Axis not yet supported to be varying for gather op!"
+    assert False, 'Axis not yet supported to be varying for gather op!'
 
 
 def linearity_1d(input_ind):
+
   def handler(explainer, op, *grads):
     return linearity_1d_handler(input_ind, explainer, op, *grads)
 
@@ -602,13 +651,15 @@ def linearity_1d_handler(input_ind, explainer, op, *grads):
   # make sure only the given input varies (negative means only that input cannot vary, and is measured from the end of the list)
   for i in range(len(op.inputs)):
     if i != input_ind:
-      assert not explainer._variable_inputs(op)[i], str(i) + "th input to " + op.name + " cannot vary!"
-  if op.type.startswith("shap_"):
+      assert not explainer._variable_inputs(
+          op)[i], str(i) + 'th input to ' + op.name + ' cannot vary!'
+  if op.type.startswith('shap_'):
     op.type = op.type[5:]
   return explainer.orig_grads[op.type](op, *grads)
 
 
 def linearity_with_excluded(input_inds):
+
   def handler(explainer, op, *grads):
     return linearity_with_excluded_handler(input_inds, explainer, op, *grads)
 
@@ -619,20 +670,21 @@ def linearity_with_excluded_handler(input_inds, explainer, op, *grads):
   # make sure the given inputs don't vary (negative is measured from the end of the list)
   for i in range(len(op.inputs)):
     if i in input_inds or i - len(op.inputs) in input_inds:
-      assert not explainer._variable_inputs(op)[i], str(i) + "th input to " + op.name + " cannot vary!"
-  if op.type.startswith("shap_"):
+      assert not explainer._variable_inputs(
+          op)[i], str(i) + 'th input to ' + op.name + ' cannot vary!'
+  if op.type.startswith('shap_'):
     op.type = op.type[5:]
   return explainer.orig_grads[op.type](op, *grads)
 
 
 def passthrough(explainer, op, *grads):
-  if op.type.startswith("shap_"):
+  if op.type.startswith('shap_'):
     op.type = op.type[5:]
   return explainer.orig_grads[op.type](op, *grads)
 
 
 def break_dependence(explainer, op, *grads):
-  """ This function name is used to break attribution dependence in the graph traversal.
+  """This function name is used to break attribution dependence in the graph traversal.
 
   These operation types may be connected above input data values in the graph but their outputs
   don't depend on the input values (for example they just depend on the shape).
@@ -643,68 +695,72 @@ def break_dependence(explainer, op, *grads):
 op_handlers = {}
 
 # ops that are always linear
-op_handlers["Identity"] = passthrough
-op_handlers["StridedSlice"] = passthrough
-op_handlers["Squeeze"] = passthrough
-op_handlers["ExpandDims"] = passthrough
-op_handlers["Pack"] = passthrough
-op_handlers["BiasAdd"] = passthrough
-op_handlers["Unpack"] = passthrough
-op_handlers["Add"] = passthrough
-op_handlers["Sub"] = passthrough
-op_handlers["Merge"] = passthrough
-op_handlers["Sum"] = passthrough
-op_handlers["Mean"] = passthrough
-op_handlers["Cast"] = passthrough
-op_handlers["Transpose"] = passthrough
-op_handlers["Enter"] = passthrough
-op_handlers["Exit"] = passthrough
-op_handlers["NextIteration"] = passthrough
-op_handlers["Tile"] = passthrough
-op_handlers["TensorArrayScatterV3"] = passthrough
-op_handlers["TensorArrayReadV3"] = passthrough
-op_handlers["TensorArrayWriteV3"] = passthrough
+op_handlers['Identity'] = passthrough
+op_handlers['StridedSlice'] = passthrough
+op_handlers['Squeeze'] = passthrough
+op_handlers['ExpandDims'] = passthrough
+op_handlers['Pack'] = passthrough
+op_handlers['BiasAdd'] = passthrough
+op_handlers['Unpack'] = passthrough
+op_handlers['Add'] = passthrough
+op_handlers['Sub'] = passthrough
+op_handlers['Merge'] = passthrough
+op_handlers['Sum'] = passthrough
+op_handlers['Mean'] = passthrough
+op_handlers['Cast'] = passthrough
+op_handlers['Transpose'] = passthrough
+op_handlers['Enter'] = passthrough
+op_handlers['Exit'] = passthrough
+op_handlers['NextIteration'] = passthrough
+op_handlers['Tile'] = passthrough
+op_handlers['TensorArrayScatterV3'] = passthrough
+op_handlers['TensorArrayReadV3'] = passthrough
+op_handlers['TensorArrayWriteV3'] = passthrough
 
 # ops that don't pass any attributions to their inputs
-op_handlers["Shape"] = break_dependence
-op_handlers["RandomUniform"] = break_dependence
-op_handlers["ZerosLike"] = break_dependence
+op_handlers['Shape'] = break_dependence
+op_handlers['RandomUniform'] = break_dependence
+op_handlers['ZerosLike'] = break_dependence
 # op_handlers["StopGradient"] = break_dependence # this allows us to stop attributions when we want to (like softmax re-centering)
 
 # ops that are linear and only allow a single input to vary
-op_handlers["Reshape"] = linearity_1d(0)
-op_handlers["Pad"] = linearity_1d(0)
-op_handlers["ReverseV2"] = linearity_1d(0)
-op_handlers["ConcatV2"] = linearity_with_excluded([-1])
-op_handlers["Conv2D"] = linearity_1d(0)
-op_handlers["Switch"] = linearity_1d(0)
-op_handlers["AvgPool"] = linearity_1d(0)
-op_handlers["FusedBatchNorm"] = linearity_1d(0)
+op_handlers['Reshape'] = linearity_1d(0)
+op_handlers['Pad'] = linearity_1d(0)
+op_handlers['ReverseV2'] = linearity_1d(0)
+op_handlers['ConcatV2'] = linearity_with_excluded([-1])
+op_handlers['Conv2D'] = linearity_1d(0)
+op_handlers['Switch'] = linearity_1d(0)
+op_handlers['AvgPool'] = linearity_1d(0)
+op_handlers['FusedBatchNorm'] = linearity_1d(0)
 
 # ops that are nonlinear and only allow a single input to vary
-op_handlers["Relu"] = nonlinearity_1d(0)
-op_handlers["Elu"] = nonlinearity_1d(0)
-op_handlers["Sigmoid"] = nonlinearity_1d(0)
-op_handlers["Tanh"] = nonlinearity_1d(0)
-op_handlers["Softplus"] = nonlinearity_1d(0)
-op_handlers["Exp"] = nonlinearity_1d(0)
-op_handlers["ClipByValue"] = nonlinearity_1d(0)
-op_handlers["Rsqrt"] = nonlinearity_1d(0)
-op_handlers["Square"] = nonlinearity_1d(0)
-op_handlers["Max"] = nonlinearity_1d(0)
+op_handlers['Relu'] = nonlinearity_1d(0)
+op_handlers['Elu'] = nonlinearity_1d(0)
+op_handlers['Sigmoid'] = nonlinearity_1d(0)
+op_handlers['Tanh'] = nonlinearity_1d(0)
+op_handlers['Softplus'] = nonlinearity_1d(0)
+op_handlers['Exp'] = nonlinearity_1d(0)
+op_handlers['ClipByValue'] = nonlinearity_1d(0)
+op_handlers['Rsqrt'] = nonlinearity_1d(0)
+op_handlers['Square'] = nonlinearity_1d(0)
+op_handlers['Max'] = nonlinearity_1d(0)
 
 # ops that are nonlinear and allow two inputs to vary
-op_handlers["SquaredDifference"] = nonlinearity_1d_nonlinearity_2d(0, 1, lambda x, y: (x - y) * (x - y))
-op_handlers["Minimum"] = nonlinearity_1d_nonlinearity_2d(0, 1, lambda x, y: tf.minimum(x, y))
-op_handlers["Maximum"] = nonlinearity_1d_nonlinearity_2d(0, 1, lambda x, y: tf.maximum(x, y))
+op_handlers['SquaredDifference'] = nonlinearity_1d_nonlinearity_2d(
+    0, 1, lambda x, y: (x - y) * (x - y))
+op_handlers['Minimum'] = nonlinearity_1d_nonlinearity_2d(
+    0, 1, lambda x, y: tf.minimum(x, y))
+op_handlers['Maximum'] = nonlinearity_1d_nonlinearity_2d(
+    0, 1, lambda x, y: tf.maximum(x, y))
 
 # ops that allow up to two inputs to vary are are linear when only one input varies
-op_handlers["Mul"] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: x * y)
-op_handlers["RealDiv"] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: x / y)
-op_handlers["MatMul"] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: tf.matmul(x, y))
+op_handlers['Mul'] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: x * y)
+op_handlers['RealDiv'] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: x / y)
+op_handlers['MatMul'] = linearity_1d_nonlinearity_2d(
+    0, 1, lambda x, y: tf.matmul(x, y))
 
 # ops that need their own custom attribution functions
-op_handlers["GatherV2"] = gather
-op_handlers["ResourceGather"] = gather
-op_handlers["MaxPool"] = maxpool
-op_handlers["Softmax"] = softmax
+op_handlers['GatherV2'] = gather
+op_handlers['ResourceGather'] = gather
+op_handlers['MaxPool'] = maxpool
+op_handlers['Softmax'] = softmax
diff --git a/easy_rec/python/tools/explainer/explainer.py b/easy_rec/python/tools/explainer/explainer.py
index a40784458..04d2bc4dc 100644
--- a/easy_rec/python/tools/explainer/explainer.py
+++ b/easy_rec/python/tools/explainer/explainer.py
@@ -1,24 +1,26 @@
-import tensorflow as tf
-from tensorflow.python.platform import gfile
-from tensorflow.python.saved_model import signature_constants
-from easy_rec.python.utils.load_class import get_register_class_meta
-from easy_rec.python.utils.config_util import get_configs_from_pipeline_file
-from easy_rec.python.utils.input_utils import get_type_defaults
-from easy_rec.python.tools.explainer.methods import DeepExplain
-# from easy_rec.python.tools.explainer.deep_shap import DeepShap
-from easy_rec.python.protos.dataset_pb2 import DatasetConfig
 import abc
 import collections
-import numpy as np
 import logging
-import six
+import os
 import time
+
+import numpy as np
+import six
+import tensorflow as tf
 from six import moves
-import os
+from tensorflow.python.platform import gfile
+from tensorflow.python.saved_model import signature_constants
+
+# from easy_rec.python.tools.explainer.deep_shap import DeepShap
+from easy_rec.python.protos.dataset_pb2 import DatasetConfig
+from easy_rec.python.tools.explainer.methods import DeepExplain
+from easy_rec.python.utils.config_util import get_configs_from_pipeline_file
+from easy_rec.python.utils.input_utils import get_type_defaults
+from easy_rec.python.utils.load_class import get_register_class_meta
 
 _EXPLAINER_CLASS_MAP = {}
 _register_abc_meta = get_register_class_meta(
-  _EXPLAINER_CLASS_MAP, have_abstract_class=True)
+    _EXPLAINER_CLASS_MAP, have_abstract_class=True)
 
 
 class Explainer(six.with_metaclass(_register_abc_meta, object)):
@@ -48,17 +50,18 @@ def _build_model(self):
       assert tf.saved_model.loader.maybe_saved_model_directory(model_path), \
         'saved model does not exists in %s' % model_path
     else:
-      raise ValueError('currently only savedmodel is supported, path:' + model_path)
+      raise ValueError('currently only savedmodel is supported, path:' +
+                       model_path)
 
     input_fields = _get_input_fields_from_pipeline_config(model_path)
     self._input_fields_info, self._input_fields = input_fields
 
     de = self.deep_explain
     meta_graph_def = tf.saved_model.loader.load(
-      de.session, [tf.saved_model.tag_constants.SERVING], model_path)
+        de.session, [tf.saved_model.tag_constants.SERVING], model_path)
     # parse signature
     signature_def = meta_graph_def.signature_def[
-      signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
     inputs = signature_def.inputs
     input_info = []
     self._is_multi_placeholder = len(inputs.items()) > 1
@@ -76,8 +79,8 @@ def _build_model(self):
           # in which case, the order of inputs may not be the
           # same as they are defined, therefore, list input
           # could not be supported, only dict input could be supported
-          logging.warning(
-            'could not determine input_id from input_name: %s' % input_name)
+          logging.warning('could not determine input_id from input_name: %s' %
+                          input_name)
           input_id = gid
         input_info.append((input_id, name, tensor.dtype))
         self._inputs_map[name] = de.graph.get_tensor_by_name(tensor.name)
@@ -120,7 +123,8 @@ def default_values(self):
 
     default_value = []
     for i, (field, name) in enumerate(zip(input_fields, self._input_names)):
-      assert field == name, "input field `%d` has different names: <%s, %s>" % (i, field, name)
+      assert field == name, 'input field `%d` has different names: <%s, %s>' % (
+          i, field, name)
       value = self._get_defaults(field)
       # default_value.append(np.array([value]))  # for deep_shap
       default_value.append(np.array(value))  # for deep_shap
@@ -134,20 +138,21 @@ def _get_defaults(self, col_name, col_type='string'):
     else:
       defaults = {'string': '', 'double': 0.0, 'bigint': 0}
       assert col_type in defaults, 'invalid col_type: %s, col_type: %s' % (
-        col_name, col_type)
+          col_name, col_type)
       default_val = defaults[col_type]
       logging.info(
-        'col_name: %s, default_val: %s.[not defined in saved_model_dir/assets/pipeline.config]'
-        % (col_name, default_val))
+          'col_name: %s, default_val: %s.[not defined in saved_model_dir/assets/pipeline.config]'
+          % (col_name, default_val))
     return default_val
 
   def str_to_number(self, values):
-    assert len(values) == len(self._input_fields), "value count %d is not equal to the number of input fields %d" % (
-      len(values), len(self._input_fields)
-    )
+    assert len(values) == len(
+        self._input_fields
+    ), 'value count %d is not equal to the number of input fields %d' % (
+        len(values), len(self._input_fields))
     result = []
     for i, name in enumerate(self._input_names):
-      assert name in self._input_fields_info, "input `%s` not in pipeline config" % name
+      assert name in self._input_fields_info, 'input `%s` not in pipeline config' % name
       idx = self._input_fields.index(name)
       input_type, default_val = self._input_fields_info[name]
       if input_type in {DatasetConfig.INT32, DatasetConfig.INT64}:
@@ -177,24 +182,28 @@ def get_explainer(self, output_cols=None):
         tmp_cols.append(tmp_keys[0].strip())
       self._output_cols = tmp_cols
     if len(self._output_cols) > 1:
-      logging.warning('Only one output can be supported currently, use the first one: %s', self._output_cols[0])
+      logging.warning(
+          'Only one output can be supported currently, use the first one: %s',
+          self._output_cols[0])
 
     output_name = self._output_cols[0]
     assert output_name in self.output_names, 'invalid output name `%s` not in model outputs `%s`' % (
-      output_name, ','.join(self.output_names))
+        output_name, ','.join(self.output_names))
     if output_name is None:
       output = self._outputs_map.values()[0]
     elif type(output_name) in {str, unicode}:
       output = self._outputs_map[output_name]
     else:
-      raise Exception('unsupported type of output_name: ' + str(type(output_name)))
+      raise Exception('unsupported type of output_name: ' +
+                      str(type(output_name)))
 
     def_vals = self.default_values()
     # print('default values (%d):' % len(def_vals), def_vals)
     inputs = [self._inputs_map[name] for name in self._input_names]
     # e = DeepShap(inputs, output, def_vals, session=self._session)
     # self._explainer = e
-    e = self.deep_explain.get_explainer(self.method, output, inputs, baseline=def_vals)
+    e = self.deep_explain.get_explainer(
+        self.method, output, inputs, baseline=def_vals)
     return e
 
   @property
@@ -236,6 +245,7 @@ def feature_importance(self,
 
 
 class OdpsExplainer(Explainer):
+
   def feature_importance(self,
                          input_path,
                          output_path,
@@ -247,17 +257,24 @@ def feature_importance(self,
     input_cols = self.input_names
     input_dim = len(input_cols)
     if reserved_cols:
-      reserved_cols = [x.strip() for x in reserved_cols.split(',') if x.strip() not in input_cols]
+      reserved_cols = [
+          x.strip()
+          for x in reserved_cols.split(',')
+          if x.strip() not in input_cols
+      ]
       input_cols.extend(reserved_cols)
     selected_cols = ','.join(input_cols)
-    print("selected_cols: " + selected_cols)
+    print('selected_cols: ' + selected_cols)
 
     explainer = self.get_explainer(output_cols)
-    print("reference value:", explainer.expected_value)
+    print('reference value:', explainer.expected_value)
 
     import common_io
-    reader = common_io.table.TableReader(input_path, selected_cols=selected_cols,
-                                         slice_id=slice_id, slice_count=slice_num)
+    reader = common_io.table.TableReader(
+        input_path,
+        selected_cols=selected_cols,
+        slice_id=slice_id,
+        slice_count=slice_num)
 
     reserved_cols_idx = []
     if reserved_cols:
@@ -302,13 +319,15 @@ def feature_importance(self,
 
 
 class OdpsRtpExplainer(Explainer):
+
   def __init__(self, deep_explain, model_path, method_name):
-    super(OdpsRtpExplainer, self).__init__(deep_explain, model_path, method_name)
+    super(OdpsRtpExplainer, self).__init__(deep_explain, model_path,
+                                           method_name)
     pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
     if not gfile.Exists(pipeline_path):
       logging.warning(
-        '%s not exists, default values maybe inconsistent with the values used in training.'
-        % pipeline_path)
+          '%s not exists, default values maybe inconsistent with the values used in training.'
+          % pipeline_path)
       return
     pipeline_config = get_configs_from_pipeline_file(pipeline_path)
     self._fg_separator = pipeline_config.data_config.separator
@@ -325,19 +344,20 @@ def __init__(self, deep_explain, model_path, method_name):
       self._effective_fields = []
       for fc in feature_configs:
         for input_name in fc.input_names:
-          assert input_name in self._input_fields, 'invalid input_name in %s' % str(fc)
+          assert input_name in self._input_fields, 'invalid input_name in %s' % str(
+              fc)
           if input_name not in self._effective_fields:
             self._effective_fields.append(input_name)
       self._effective_fids = [
-        self._input_fields.index(x) for x in self._effective_fields
+          self._input_fields.index(x) for x in self._effective_fields
       ]
       # sort fids from small to large
       self._effective_fids = list(set(self._effective_fids))
       self._effective_fields = [
-        self._input_fields[x] for x in self._effective_fids
+          self._input_fields[x] for x in self._effective_fids
       ]
-      logging.info(
-        "raw input fields: %d, effective fields: %d" % (len(self._input_fields), len(self._effective_fields)))
+      logging.info('raw input fields: %d, effective fields: %d' %
+                   (len(self._input_fields), len(self._effective_fields)))
 
   def feature_importance(self,
                          input_path,
@@ -352,14 +372,17 @@ def feature_importance(self,
     if 'features' not in input_cols:
       input_cols.append('features')
     selected_cols = ','.join(input_cols)
-    print("selected_cols: " + selected_cols)
+    print('selected_cols: ' + selected_cols)
 
     explainer = self.get_explainer(output_cols)
-    print("reference value:", explainer.expected_value)
+    print('reference value:', explainer.expected_value)
 
     import common_io
-    reader = common_io.table.TableReader(input_path, selected_cols=selected_cols,
-                                         slice_id=slice_id, slice_count=slice_num)
+    reader = common_io.table.TableReader(
+        input_path,
+        selected_cols=selected_cols,
+        slice_id=slice_id,
+        slice_count=slice_num)
 
     sum_t0, sum_t1, sum_t2 = 0, 0, 0
     writer = common_io.table.TableWriter(output_path, slice_id=slice_id)
@@ -373,9 +396,11 @@ def feature_importance(self,
       for j in range(len(records)):
         if reserved_dim > 0:
           reserved.append(records[j][:reserved_dim])
-        inputs.append(self.str_to_number(records[j][-1].decode('utf-8').split(self._fg_separator)))
+        inputs.append(
+            self.str_to_number(records[j][-1].decode('utf-8').split(
+                self._fg_separator)))
       inputs = list(np.array(inputs).T)
-      print("inputs:", inputs)
+      print('inputs:', inputs)
       # sv = explainer.shap_values(inputs, check_additivity=False)
       ret = explainer.run(inputs, batch_size=len(records))
       ret = np.array(ret)
@@ -406,8 +431,8 @@ def _get_input_fields_from_pipeline_config(model_path):
   pipeline_path = os.path.join(model_path, 'assets/pipeline.config')
   if not gfile.Exists(pipeline_path):
     logging.warning(
-      '%s not exists, default values maybe inconsistent with the values used in training.'
-      % pipeline_path)
+        '%s not exists, default values maybe inconsistent with the values used in training.'
+        % pipeline_path)
     return {}, []
   pipeline_config = get_configs_from_pipeline_file(pipeline_path)
   data_config = pipeline_config.data_config
@@ -418,11 +443,15 @@ def _get_input_fields_from_pipeline_config(model_path):
 
   input_fields = data_config.input_fields
   input_fields_info = {
-    input_field.input_name:
-      (input_field.input_type, input_field.default_val)
-    for input_field in input_fields if input_field.input_name not in labels
+      input_field.input_name: (input_field.input_type, input_field.default_val)
+      for input_field in input_fields
+      if input_field.input_name not in labels
   }
-  input_fields_list = [input_field.input_name for input_field in input_fields if input_field.input_name not in labels]
+  input_fields_list = [
+      input_field.input_name
+      for input_field in input_fields
+      if input_field.input_name not in labels
+  ]
   return input_fields_info, input_fields_list
 
 
@@ -448,12 +477,11 @@ def search_pb(directory, use_latest=False):
     if use_latest:
       logging.info('find %d models: %s' % (len(dir_list), ','.join(dir_list)))
       dir_list = sorted(
-        dir_list,
-        key=lambda x: int(x.split('/')[(-2 if (x[-1] == '/') else -1)]))
+          dir_list,
+          key=lambda x: int(x.split('/')[(-2 if (x[-1] == '/') else -1)]))
       return dir_list[-1]
     else:
-      raise ValueError('multiple saved model found in directory %s' %
-                       directory)
+      raise ValueError('multiple saved model found in directory %s' % directory)
 
   return dir_list[0]
 
@@ -490,17 +518,17 @@ def run(FLAGS):
 
   gpu_options = tf.GPUOptions(allow_growth=True)
   session_config = tf.ConfigProto(
-    gpu_options=gpu_options,
-    allow_soft_placement=True)
+      gpu_options=gpu_options, allow_soft_placement=True)
   session = tf.Session(config=session_config)
 
   worker_count = len(FLAGS.worker_hosts.split(','))
   with DeepExplain(session=session) as de:
     e = OdpsRtpExplainer(de, model_path, 'deeplift')
-    e.feature_importance(FLAGS.explain_tables if FLAGS.explain_tables else FLAGS.tables,
-                         FLAGS.outputs,
-                         reserved_cols=FLAGS.reserved_cols,
-                         output_cols=FLAGS.output_cols,
-                         batch_size=FLAGS.batch_size,
-                         slice_id=FLAGS.task_index,
-                         slice_num=worker_count)
+    e.feature_importance(
+        FLAGS.explain_tables if FLAGS.explain_tables else FLAGS.tables,
+        FLAGS.outputs,
+        reserved_cols=FLAGS.reserved_cols,
+        output_cols=FLAGS.output_cols,
+        batch_size=FLAGS.batch_size,
+        slice_id=FLAGS.task_index,
+        slice_num=worker_count)
diff --git a/easy_rec/python/tools/explainer/feature_importance.py b/easy_rec/python/tools/explainer/feature_importance.py
index 034f3c0da..7085274ab 100644
--- a/easy_rec/python/tools/explainer/feature_importance.py
+++ b/easy_rec/python/tools/explainer/feature_importance.py
@@ -1,9 +1,13 @@
 from __future__ import print_function
-from easy_rec.python.tools.explainer.explainer import run
+
 import tensorflow as tf
+
+from easy_rec.python.tools.explainer.explainer import run
+
 flags = tf.app.flags
 
-flags.DEFINE_string('saved_model_dir', '', 'directory where saved_model.pb exists')
+flags.DEFINE_string('saved_model_dir', '',
+                    'directory where saved_model.pb exists')
 flags.DEFINE_string('explain_tables', '', 'tables used for explaination')
 flags.DEFINE_string('background_table', '', 'tables used for expected value')
 flags.DEFINE_string('tables', '', 'tables passed by pai command')
@@ -18,7 +22,8 @@
     'output_cols', None,
     'output columns, such as: score float. multiple columns are separated by ,')
 flags.DEFINE_integer('batch_size', 1024, 'predict batch size')
-flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of hostname:port pairs')
+flags.DEFINE_string('worker_hosts', '',
+                    'Comma-separated list of hostname:port pairs')
 flags.DEFINE_integer('task_index', 0, 'Index of task within the job')
 
 FLAGS = flags.FLAGS
@@ -28,7 +33,7 @@ def main(_):
   for k in FLAGS:
     if k in ('h', 'help', 'helpshort', 'helpfull'):
       continue
-    print("%s=%s" % (k, FLAGS[k].value))
+    print('%s=%s' % (k, FLAGS[k].value))
 
   # worker_count = len(FLAGS.worker_hosts.split(','))
   # e = create_explainer(FLAGS.saved_model_dir)
diff --git a/easy_rec/python/tools/explainer/methods.py b/easy_rec/python/tools/explainer/methods.py
index aa7192acc..38c53be55 100644
--- a/easy_rec/python/tools/explainer/methods.py
+++ b/easy_rec/python/tools/explainer/methods.py
@@ -2,60 +2,62 @@
 from __future__ import division
 from __future__ import print_function
 
+import logging
 import sys
+import warnings
+from collections import OrderedDict
+
 import numpy as np
-from skimage.util import view_as_windows
-import warnings, logging
 import tensorflow as tf
+from skimage.util import view_as_windows
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import nn_grad, math_grad
-from collections import OrderedDict
-from easy_rec.python.tools.explainer.utils import make_batches, slice_arrays, to_list, unpack_singleton
+from tensorflow.python.ops import math_grad
+from tensorflow.python.ops import nn_grad
+
+from easy_rec.python.tools.explainer.utils import make_batches
+from easy_rec.python.tools.explainer.utils import slice_arrays
+from easy_rec.python.tools.explainer.utils import to_list
+from easy_rec.python.tools.explainer.utils import unpack_singleton
 
-SUPPORTED_ACTIVATIONS = [
-    'Relu', 'Elu', 'Sigmoid', 'Tanh', 'Softplus'
-]
+SUPPORTED_ACTIVATIONS = ['Relu', 'Elu', 'Sigmoid', 'Tanh', 'Softplus']
 
-UNSUPPORTED_ACTIVATIONS = [
-    'CRelu', 'Relu6', 'Softsign'
-]
+UNSUPPORTED_ACTIVATIONS = ['CRelu', 'Relu6', 'Softsign']
 
 _ENABLED_METHOD_CLASS = None
 _GRAD_OVERRIDE_CHECKFLAG = 0
 
-
 # -----------------------------------------------------------------------------
 # UTILITY FUNCTIONS
 # -----------------------------------------------------------------------------
 
 
 def activation(type):
-    """
-    Returns Tensorflow's activation op, given its type
-    :param type: string
-    :return: op
-    """
-    if type not in SUPPORTED_ACTIVATIONS:
-        warnings.warn('Activation function (%s) not supported' % type)
-    f = getattr(tf.nn, type.lower())
-    return f
+  """Returns Tensorflow's activation op, given its type.
+
+  :param type: string
+  :return: op
+  """
+  if type not in SUPPORTED_ACTIVATIONS:
+    warnings.warn('Activation function (%s) not supported' % type)
+  f = getattr(tf.nn, type.lower())
+  return f
 
 
 def original_grad(op, grad):
-    """
-    Return original Tensorflow gradient for an op
-    :param op: op
-    :param grad: Tensor
-    :return: Tensor
-    """
-    if op.type not in SUPPORTED_ACTIVATIONS:
-        warnings.warn('Activation function (%s) not supported' % op.type)
-    opname = '_%sGrad' % op.type
-    if hasattr(nn_grad, opname):
-        f = getattr(nn_grad, opname)
-    else:
-        f = getattr(math_grad, opname)
-    return f(op, grad)
+  """Return original Tensorflow gradient for an op.
+
+  :param op: op
+  :param grad: Tensor
+  :return: Tensor
+  """
+  if op.type not in SUPPORTED_ACTIVATIONS:
+    warnings.warn('Activation function (%s) not supported' % op.type)
+  opname = '_%sGrad' % op.type
+  if hasattr(nn_grad, opname):
+    f = getattr(nn_grad, opname)
+  else:
+    f = getattr(math_grad, opname)
+  return f(op, grad)
 
 
 # -----------------------------------------------------------------------------
@@ -64,172 +66,194 @@ def original_grad(op, grad):
 
 
 class AttributionMethod(object):
-    """
-    Attribution method base class
-    """
-    def __init__(self, T, X, session, keras_learning_phase=None):
-        self.T = T  # target Tensor
-        self.X = X  # input Tensor
-        self.Y_shape = [None,] + T.get_shape().as_list()[1:]
-        # Most often T contains multiple output units. In this case, it is often necessary to select
-        # a single unit to compute contributions for. This can be achieved passing 'ys' as weight for the output Tensor.
-        self.Y = tf.placeholder(tf.float32, self.Y_shape)
-        # placeholder_from_data(ys) if ys is not None else 1.0  # Tensor that represents weights for T
-        self.T = self.T * self.Y
-        self.symbolic_attribution = None
-        self.session = session
-        self.keras_learning_phase = keras_learning_phase
-        self.has_multiple_inputs = type(self.X) is list or type(self.X) is tuple
-        logging.info('Model with multiple inputs: %s' % self.has_multiple_inputs)
-
-        # Set baseline
-        # TODO: now this sets a baseline also for those methods that does not require it
-        self._set_check_baseline()
-
-        # References
-        self._init_references()
-
-        # Create symbolic explanation once during construction (affects only gradient-based methods)
-        self.explain_symbolic()
-
-    def explain_symbolic(self):
-        return None
-
-    def run(self, xs, ys=None, batch_size=None):
-        pass
-
-    def _init_references(self):
-        pass
-
-    def _check_input_compatibility(self, xs, ys=None, batch_size=None):
+  """Attribution method base class."""
+
+  def __init__(self, T, X, session, keras_learning_phase=None):
+    self.T = T  # target Tensor
+    self.X = X  # input Tensor
+    self.Y_shape = [
+        None,
+    ] + T.get_shape().as_list()[1:]
+    # Most often T contains multiple output units. In this case, it is often necessary to select
+    # a single unit to compute contributions for. This can be achieved passing 'ys' as weight for the output Tensor.
+    self.Y = tf.placeholder(tf.float32, self.Y_shape)
+    # placeholder_from_data(ys) if ys is not None else 1.0  # Tensor that represents weights for T
+    self.T = self.T * self.Y
+    self.symbolic_attribution = None
+    self.session = session
+    self.keras_learning_phase = keras_learning_phase
+    self.has_multiple_inputs = type(self.X) is list or type(self.X) is tuple
+    logging.info('Model with multiple inputs: %s' % self.has_multiple_inputs)
+
+    # Set baseline
+    # TODO: now this sets a baseline also for those methods that does not require it
+    self._set_check_baseline()
+
+    # References
+    self._init_references()
+
+    # Create symbolic explanation once during construction (affects only gradient-based methods)
+    self.explain_symbolic()
+
+  def explain_symbolic(self):
+    return None
+
+  def run(self, xs, ys=None, batch_size=None):
+    pass
+
+  def _init_references(self):
+    pass
+
+  def _check_input_compatibility(self, xs, ys=None, batch_size=None):
+    if ys is not None:
+      if not self.has_multiple_inputs and len(xs) != len(ys):
+        raise RuntimeError(
+            'When provided, ys must have the same batch size as xs (xs has batch size {} and ys {})'
+            .format(len(xs), len(ys)))
+      elif self.has_multiple_inputs and np.all([len(i) != len(ys) for i in xs]):
+        raise RuntimeError(
+            'When provided, ys must have the same batch size as all elements of xs'
+        )
+    if batch_size is not None and batch_size > 0:
+      if self.T.shape[0].value is not None and self.T.shape[
+          0].value is not batch_size:
+        raise RuntimeError(
+            'When using batch evaluation, the first dimension of the target tensor '
+            'must be compatible with the batch size. Found %s instead' %
+            self.T.shape[0].value)
+      if isinstance(self.X, list):
+        for x in self.X:
+          if x.shape[0].value is not None and x.shape[0].value is not batch_size:
+            raise RuntimeError(
+                'When using batch evaluation, the first dimension of the input tensor '
+                'must be compatible with the batch size. Found %s instead' %
+                x.shape[0].value)
+      else:
+        if self.X.shape[0].value is not None and self.X.shape[
+            0].value is not batch_size:
+          raise RuntimeError(
+              'When using batch evaluation, the first dimension of the input tensor '
+              'must be compatible with the batch size. Found %s instead' %
+              self.X.shape[0].value)
+
+  def _session_run_batch(self, T, xs, ys=None):
+    feed_dict = {}
+    if self.has_multiple_inputs:
+      for k, v in zip(self.X, xs):
+        feed_dict[k] = v
+    else:
+      feed_dict[self.X] = xs
+
+    # If ys is not passed, produce a vector of ones that will be broadcasted to all batch samples
+    feed_dict[self.Y] = ys if ys is not None else np.ones([
+        1,
+    ] + self.Y_shape[1:])
+
+    if self.keras_learning_phase is not None:
+      feed_dict[self.keras_learning_phase] = 0
+    return self.session.run(T, feed_dict)
+
+  def _session_run(self, T, xs, ys=None, batch_size=None):
+    num_samples = len(xs)
+    if self.has_multiple_inputs is True:
+      num_samples = len(xs[0])
+      if len(xs) != len(self.X):
+        raise RuntimeError(
+            'List of input tensors and input data have different lengths (%s and %s)'
+            % (str(len(xs)), str(len(self.X))))
+      if batch_size is not None:
+        for xi in xs:
+          if len(xi) != num_samples:
+            raise RuntimeError(
+                'Evaluation in batches requires all inputs to have '
+                'the same number of samples')
+
+    if batch_size is None or batch_size <= 0 or num_samples <= batch_size:
+      return self._session_run_batch(T, xs, ys)
+    else:
+      outs = []
+      batches = make_batches(num_samples, batch_size)
+      for batch_index, (batch_start, batch_end) in enumerate(batches):
+        # Get a batch from data
+        xs_batch = slice_arrays(xs, batch_start, batch_end)
+        # If the target tensor has one entry for each sample, we need to batch it as well
+        ys_batch = None
         if ys is not None:
-            if not self.has_multiple_inputs and len(xs) != len(ys):
-                raise RuntimeError('When provided, ys must have the same batch size as xs (xs has batch size {} and ys {})'.format(len(xs), len(ys)))
-            elif self.has_multiple_inputs and np.all([len(i) != len(ys) for i in xs]):
-                raise RuntimeError('When provided, ys must have the same batch size as all elements of xs')
-        if batch_size is not None and batch_size > 0:
-            if self.T.shape[0].value is not None and self.T.shape[0].value is not batch_size:
-                raise RuntimeError('When using batch evaluation, the first dimension of the target tensor '
-                                   'must be compatible with the batch size. Found %s instead' % self.T.shape[0].value)
-            if isinstance(self.X, list):
-                for x in self.X:
-                    if x.shape[0].value is not None and x.shape[0].value is not batch_size:
-                        raise RuntimeError('When using batch evaluation, the first dimension of the input tensor '
-                                           'must be compatible with the batch size. Found %s instead' % x.shape[
-                                               0].value)
-            else:
-                if self.X.shape[0].value is not None and self.X.shape[0].value is not batch_size:
-                    raise RuntimeError('When using batch evaluation, the first dimension of the input tensor '
-                                       'must be compatible with the batch size. Found %s instead' % self.X.shape[0].value)
-
-    def _session_run_batch(self, T, xs, ys=None):
-        feed_dict = {}
-        if self.has_multiple_inputs:
-            for k, v in zip(self.X, xs):
-                feed_dict[k] = v
-        else:
-            feed_dict[self.X] = xs
-
-        # If ys is not passed, produce a vector of ones that will be broadcasted to all batch samples
-        feed_dict[self.Y] = ys if ys is not None else np.ones([1,] + self.Y_shape[1:])
-
-        if self.keras_learning_phase is not None:
-            feed_dict[self.keras_learning_phase] = 0
-        return self.session.run(T, feed_dict)
-
-    def _session_run(self, T, xs, ys=None, batch_size=None):
-        num_samples = len(xs)
-        if self.has_multiple_inputs is True:
-            num_samples = len(xs[0])
-            if len(xs) != len(self.X):
-                raise RuntimeError('List of input tensors and input data have different lengths (%s and %s)'
-                                   % (str(len(xs)), str(len(self.X))))
-            if batch_size is not None:
-                for xi in xs:
-                    if len(xi) != num_samples:
-                        raise RuntimeError('Evaluation in batches requires all inputs to have '
-                                           'the same number of samples')
-
-        if batch_size is None or batch_size <= 0 or num_samples <= batch_size:
-            return self._session_run_batch(T, xs, ys)
-        else:
-            outs = []
-            batches = make_batches(num_samples, batch_size)
-            for batch_index, (batch_start, batch_end) in enumerate(batches):
-                # Get a batch from data
-                xs_batch = slice_arrays(xs, batch_start, batch_end)
-                # If the target tensor has one entry for each sample, we need to batch it as well
-                ys_batch = None
-                if ys is not None:
-                    ys_batch = slice_arrays(ys, batch_start, batch_end)
-                batch_outs = self._session_run_batch(T, xs_batch, ys_batch)
-                batch_outs = to_list(batch_outs)
-                if batch_index == 0:
-                    # Pre-allocate the results arrays.
-                    for batch_out in batch_outs:
-                        shape = (num_samples,) + batch_out.shape[1:]
-                        outs.append(np.zeros(shape, dtype=batch_out.dtype))
-                for i, batch_out in enumerate(batch_outs):
-                    outs[i][batch_start:batch_end] = batch_out
-            return unpack_singleton(outs)
-
-    def _set_check_baseline(self):
-        # Do nothing for those methods that have no baseline required
-        if not hasattr(self, "baseline"):
-            return
-
-        if self.baseline is None:
-            if self.has_multiple_inputs:
-                self.baseline = [np.zeros([1,] + xi.get_shape().as_list()[1:]) for xi in self.X]
-            else:
-                self.baseline = np.zeros([1,] + self.X.get_shape().as_list()[1:])
+          ys_batch = slice_arrays(ys, batch_start, batch_end)
+        batch_outs = self._session_run_batch(T, xs_batch, ys_batch)
+        batch_outs = to_list(batch_outs)
+        if batch_index == 0:
+          # Pre-allocate the results arrays.
+          for batch_out in batch_outs:
+            shape = (num_samples,) + batch_out.shape[1:]
+            outs.append(np.zeros(shape, dtype=batch_out.dtype))
+        for i, batch_out in enumerate(batch_outs):
+          outs[i][batch_start:batch_end] = batch_out
+      return unpack_singleton(outs)
+
+  def _set_check_baseline(self):
+    # Do nothing for those methods that have no baseline required
+    if not hasattr(self, 'baseline'):
+      return
+
+    if self.baseline is None:
+      if self.has_multiple_inputs:
+        self.baseline = [
+            np.zeros([
+                1,
+            ] + xi.get_shape().as_list()[1:]) for xi in self.X
+        ]
+      else:
+        self.baseline = np.zeros([
+            1,
+        ] + self.X.get_shape().as_list()[1:])
 
+    else:
+      if self.has_multiple_inputs:
+        for i, xi in enumerate(self.X):
+          if list(self.baseline[i].shape) == xi.get_shape().as_list()[1:]:
+            self.baseline[i] = np.expand_dims(self.baseline[i], 0)
+          else:
+            raise RuntimeError(
+                'Baseline shape %s does not match expected shape %s' %
+                (self.baseline[i].shape, xi.get_shape().as_list()[1:]))
+      else:
+        if list(self.baseline.shape) == self.X.get_shape().as_list()[1:]:
+          self.baseline = np.expand_dims(self.baseline, 0)
         else:
-            if self.has_multiple_inputs:
-                for i, xi in enumerate(self.X):
-                    if list(self.baseline[i].shape) == xi.get_shape().as_list()[1:]:
-                        self.baseline[i] = np.expand_dims(self.baseline[i], 0)
-                    else:
-                        raise RuntimeError('Baseline shape %s does not match expected shape %s'
-                                           % (self.baseline[i].shape, xi.get_shape().as_list()[1:]))
-            else:
-                if list(self.baseline.shape) == self.X.get_shape().as_list()[1:]:
-                    self.baseline = np.expand_dims(self.baseline, 0)
-                else:
-                    raise RuntimeError('Baseline shape %s does not match expected shape %s'
-                                       % (self.baseline.shape, self.X.get_shape().as_list()[1:]))
+          raise RuntimeError(
+              'Baseline shape %s does not match expected shape %s' %
+              (self.baseline.shape, self.X.get_shape().as_list()[1:]))
 
 
 class GradientBasedMethod(AttributionMethod):
-    """
-    Base class for gradient-based attribution methods
-    """
-    def get_symbolic_attribution(self):
-        return tf.gradients(self.T, self.X)
+  """Base class for gradient-based attribution methods."""
+
+  def get_symbolic_attribution(self):
+    return tf.gradients(self.T, self.X)
 
-    def explain_symbolic(self):
-        if self.symbolic_attribution is None:
-            self.symbolic_attribution = self.get_symbolic_attribution()
-        return self.symbolic_attribution
+  def explain_symbolic(self):
+    if self.symbolic_attribution is None:
+      self.symbolic_attribution = self.get_symbolic_attribution()
+    return self.symbolic_attribution
 
-    def run(self, xs, ys=None, batch_size=None):
-        self._check_input_compatibility(xs, ys, batch_size)
-        results = self._session_run(self.explain_symbolic(), xs, ys, batch_size)
-        return results[0] if not self.has_multiple_inputs else results
+  def run(self, xs, ys=None, batch_size=None):
+    self._check_input_compatibility(xs, ys, batch_size)
+    results = self._session_run(self.explain_symbolic(), xs, ys, batch_size)
+    return results[0] if not self.has_multiple_inputs else results
 
-    @classmethod
-    def nonlinearity_grad_override(cls, op, grad):
-        return original_grad(op, grad)
+  @classmethod
+  def nonlinearity_grad_override(cls, op, grad):
+    return original_grad(op, grad)
 
 
 class PerturbationBasedMethod(AttributionMethod):
-    """
-       Base class for perturbation-based attribution methods
-       """
-    def __init__(self, T, X, session, keras_learning_phase):
-        super(PerturbationBasedMethod, self).__init__(T, X, session, keras_learning_phase)
-        self.base_activation = None
+  """Base class for perturbation-based attribution methods."""
 
+  def __init__(self, T, X, session, keras_learning_phase):
+    super(PerturbationBasedMethod, self).__init__(T, X, session,
+                                                  keras_learning_phase)
+    self.base_activation = None
 
 
 # -----------------------------------------------------------------------------
@@ -242,13 +266,14 @@ def __init__(self, T, X, session, keras_learning_phase):
 
 class DummyZero(GradientBasedMethod):
 
-    def get_symbolic_attribution(self,):
-        return tf.gradients(self.T, self.X)
+  def get_symbolic_attribution(self,):
+    return tf.gradients(self.T, self.X)
+
+  @classmethod
+  def nonlinearity_grad_override(cls, op, grad):
+    input = op.inputs[0]
+    return tf.zeros_like(input)
 
-    @classmethod
-    def nonlinearity_grad_override(cls, op, grad):
-        input = op.inputs[0]
-        return tf.zeros_like(input)
 
 """
 Saliency maps
@@ -258,8 +283,8 @@ def nonlinearity_grad_override(cls, op, grad):
 
 class Saliency(GradientBasedMethod):
 
-    def get_symbolic_attribution(self):
-        return [tf.abs(g) for g in tf.gradients(self.T, self.X)]
+  def get_symbolic_attribution(self):
+    return [tf.abs(g) for g in tf.gradients(self.T, self.X)]
 
 
 """
@@ -270,10 +295,12 @@ def get_symbolic_attribution(self):
 
 class GradientXInput(GradientBasedMethod):
 
-    def get_symbolic_attribution(self):
-        return [g * x for g, x in zip(
+  def get_symbolic_attribution(self):
+    return [
+        g * x for g, x in zip(
             tf.gradients(self.T, self.X),
-            self.X if self.has_multiple_inputs else [self.X])]
+            self.X if self.has_multiple_inputs else [self.X])
+    ]
 
 
 """
@@ -284,28 +311,38 @@ def get_symbolic_attribution(self):
 
 class IntegratedGradients(GradientBasedMethod):
 
-    def __init__(self, T, X, session, keras_learning_phase, steps=100, baseline=None):
-        self.steps = steps
-        self.baseline = baseline
-        super(IntegratedGradients, self).__init__(T, X, session, keras_learning_phase)
-
-    def run(self, xs, ys=None, batch_size=None):
-        self._check_input_compatibility(xs, ys, batch_size)
-
-        gradient = None
-        for alpha in list(np.linspace(1. / self.steps, 1.0, self.steps)):
-            xs_mod = [b + (x - b) * alpha for x, b in zip(xs, self.baseline)] if self.has_multiple_inputs \
-                else self.baseline + (xs - self.baseline) * alpha
-            _attr = self._session_run(self.explain_symbolic(), xs_mod, ys, batch_size)
-            if gradient is None: gradient = _attr
-            else: gradient = [g + a for g, a in zip(gradient, _attr)]
-
-        results = [g * (x - b) / self.steps for g, x, b in zip(
-            gradient,
-            xs if self.has_multiple_inputs else [xs],
-            self.baseline if self.has_multiple_inputs else [self.baseline])]
-
-        return results[0] if not self.has_multiple_inputs else results
+  def __init__(self,
+               T,
+               X,
+               session,
+               keras_learning_phase,
+               steps=100,
+               baseline=None):
+    self.steps = steps
+    self.baseline = baseline
+    super(IntegratedGradients, self).__init__(T, X, session,
+                                              keras_learning_phase)
+
+  def run(self, xs, ys=None, batch_size=None):
+    self._check_input_compatibility(xs, ys, batch_size)
+
+    gradient = None
+    for alpha in list(np.linspace(1. / self.steps, 1.0, self.steps)):
+      xs_mod = [b + (x - b) * alpha for x, b in zip(xs, self.baseline)] if self.has_multiple_inputs \
+          else self.baseline + (xs - self.baseline) * alpha
+      _attr = self._session_run(self.explain_symbolic(), xs_mod, ys, batch_size)
+      if gradient is None:
+        gradient = _attr
+      else:
+        gradient = [g + a for g, a in zip(gradient, _attr)]
+
+    results = [
+        g * (x - b) / self.steps for g, x, b in zip(
+            gradient, xs if self.has_multiple_inputs else [xs],
+            self.baseline if self.has_multiple_inputs else [self.baseline])
+    ]
+
+    return results[0] if not self.has_multiple_inputs else results
 
 
 """
@@ -315,25 +352,29 @@ def run(self, xs, ys=None, batch_size=None):
 
 
 class EpsilonLRP(GradientBasedMethod):
-    eps = None
+  eps = None
 
-    def __init__(self, T, X, session, keras_learning_phase, epsilon=1e-4):
-        assert epsilon > 0.0, 'LRP epsilon must be greater than zero'
-        global eps
-        eps = epsilon
-        super(EpsilonLRP, self).__init__(T, X, session, keras_learning_phase)
+  def __init__(self, T, X, session, keras_learning_phase, epsilon=1e-4):
+    assert epsilon > 0.0, 'LRP epsilon must be greater than zero'
+    global eps
+    eps = epsilon
+    super(EpsilonLRP, self).__init__(T, X, session, keras_learning_phase)
 
-    def get_symbolic_attribution(self):
-        return [g * x for g, x in zip(
+  def get_symbolic_attribution(self):
+    return [
+        g * x for g, x in zip(
             tf.gradients(self.T, self.X),
-            self.X if self.has_multiple_inputs else [self.X])]
+            self.X if self.has_multiple_inputs else [self.X])
+    ]
+
+  @classmethod
+  def nonlinearity_grad_override(cls, op, grad):
+    output = op.outputs[0]
+    input = op.inputs[0]
+    return grad * output / (
+        input + eps *
+        tf.where(input >= 0, tf.ones_like(input), -1 * tf.ones_like(input)))
 
-    @classmethod
-    def nonlinearity_grad_override(cls, op, grad):
-        output = op.outputs[0]
-        input = op.inputs[0]
-        return grad * output / (input + eps *
-                                tf.where(input >= 0, tf.ones_like(input), -1 * tf.ones_like(input)))
 
 """
 DeepLIFT
@@ -344,45 +385,48 @@ def nonlinearity_grad_override(cls, op, grad):
 
 class DeepLIFTRescale(GradientBasedMethod):
 
-    _deeplift_ref = {}
+  _deeplift_ref = {}
 
-    def __init__(self, T, X, session, keras_learning_phase, baseline=None):
-        self.baseline = baseline
-        super(DeepLIFTRescale, self).__init__(T, X, session, keras_learning_phase)
+  def __init__(self, T, X, session, keras_learning_phase, baseline=None):
+    self.baseline = baseline
+    super(DeepLIFTRescale, self).__init__(T, X, session, keras_learning_phase)
 
-    def get_symbolic_attribution(self):
-        return [g * (x - b) for g, x, b in zip(
+  def get_symbolic_attribution(self):
+    return [
+        g * (x - b) for g, x, b in zip(
             tf.gradients(self.T, self.X),
             self.X if self.has_multiple_inputs else [self.X],
-            self.baseline if self.has_multiple_inputs else [self.baseline])]
-
-    @classmethod
-    def nonlinearity_grad_override(cls, op, grad):
-        output = op.outputs[0]
-        input = op.inputs[0]
-        ref_input = cls._deeplift_ref[op.name]
-        ref_output = activation(op.type)(ref_input)
-        delta_out = output - ref_output
-        delta_in = input - ref_input
-        instant_grad = activation(op.type)(0.5 * (ref_input + input))
-        return tf.where(tf.abs(delta_in) > 1e-5, grad * delta_out / delta_in,
-                        original_grad(instant_grad.op, grad))
-
-    def _init_references(self):
-        # print ('DeepLIFT: computing references...')
-        sys.stdout.flush()
-        self._deeplift_ref.clear()
-        ops = []
-        g = tf.get_default_graph()
-        for op in g.get_operations():
-            if len(op.inputs) > 0 and not op.name.startswith('gradients'):
-                if op.type in SUPPORTED_ACTIVATIONS:
-                    ops.append(op)
-        YR = self._session_run([o.inputs[0] for o in ops], self.baseline)
-        for (r, op) in zip(YR, ops):
-            self._deeplift_ref[op.name] = r
-        # print('DeepLIFT: references ready')
-        sys.stdout.flush()
+            self.baseline if self.has_multiple_inputs else [self.baseline])
+    ]
+
+  @classmethod
+  def nonlinearity_grad_override(cls, op, grad):
+    output = op.outputs[0]
+    input = op.inputs[0]
+    ref_input = cls._deeplift_ref[op.name]
+    ref_output = activation(op.type)(ref_input)
+    delta_out = output - ref_output
+    delta_in = input - ref_input
+    instant_grad = activation(op.type)(0.5 * (ref_input + input))
+    return tf.where(
+        tf.abs(delta_in) > 1e-5, grad * delta_out / delta_in,
+        original_grad(instant_grad.op, grad))
+
+  def _init_references(self):
+    # print ('DeepLIFT: computing references...')
+    sys.stdout.flush()
+    self._deeplift_ref.clear()
+    ops = []
+    g = tf.get_default_graph()
+    for op in g.get_operations():
+      if len(op.inputs) > 0 and not op.name.startswith('gradients'):
+        if op.type in SUPPORTED_ACTIVATIONS:
+          ops.append(op)
+    YR = self._session_run([o.inputs[0] for o in ops], self.baseline)
+    for (r, op) in zip(YR, ops):
+      self._deeplift_ref[op.name] = r
+    # print('DeepLIFT: references ready')
+    sys.stdout.flush()
 
 
 """
@@ -401,58 +445,70 @@ def _init_references(self):
 
 class Occlusion(PerturbationBasedMethod):
 
-    def __init__(self, T, X, session, keras_learning_phase, window_shape=None, step=None):
-        super(Occlusion, self).__init__(T, X, session, keras_learning_phase)
-        if self.has_multiple_inputs:
-            raise RuntimeError('Multiple inputs not yet supported for perturbation methods')
-
-        input_shape = X[0].get_shape().as_list()
-        if window_shape is not None:
-            assert len(window_shape) == len(input_shape), \
-                'window_shape must have length of input (%d)' % len(input_shape)
-            self.window_shape = tuple(window_shape)
-        else:
-            self.window_shape = (1,) * len(input_shape)
+  def __init__(self,
+               T,
+               X,
+               session,
+               keras_learning_phase,
+               window_shape=None,
+               step=None):
+    super(Occlusion, self).__init__(T, X, session, keras_learning_phase)
+    if self.has_multiple_inputs:
+      raise RuntimeError(
+          'Multiple inputs not yet supported for perturbation methods')
+
+    input_shape = X[0].get_shape().as_list()
+    if window_shape is not None:
+      assert len(window_shape) == len(input_shape), \
+          'window_shape must have length of input (%d)' % len(input_shape)
+      self.window_shape = tuple(window_shape)
+    else:
+      self.window_shape = (1,) * len(input_shape)
 
-        if step is not None:
-            assert isinstance(step, int) or len(step) == len(input_shape), \
-                'step must be integer or tuple with the length of input (%d)' % len(input_shape)
-            self.step = step
-        else:
-            self.step = 1
-        self.replace_value = 0.0
-        logging.info('Input shape: %s; window_shape %s; step %s' % (input_shape, self.window_shape, self.step))
-
-    def run(self, xs, ys=None, batch_size=None):
-        self._check_input_compatibility(xs, ys, batch_size)
-        input_shape = xs.shape[1:]
-        batch_size = xs.shape[0]
-        total_dim = np.asscalar(np.prod(input_shape))
-
-        # Create mask
-        index_matrix = np.arange(total_dim).reshape(input_shape)
-        idx_patches = view_as_windows(index_matrix, self.window_shape, self.step).reshape((-1,) + self.window_shape)
-        heatmap = np.zeros_like(xs, dtype=np.float32).reshape((-1), total_dim)
-        w = np.zeros_like(heatmap)
-
-        # Compute original output
-        eval0 = self._session_run(self.T, xs, ys, batch_size)
-
-        # Start perturbation loop
-        for i, p in enumerate(idx_patches):
-            mask = np.ones(input_shape).flatten()
-            mask[p.flatten()] = self.replace_value
-            masked_xs = mask.reshape((1,) + input_shape) * xs
-            delta = eval0 - self._session_run(self.T, masked_xs, ys, batch_size)
-            delta_aggregated = np.sum(delta.reshape((batch_size, -1)), -1, keepdims=True)
-            heatmap[:, p.flatten()] += delta_aggregated
-            w[:, p.flatten()] += p.size
-
-        attribution = np.reshape(heatmap / w, xs.shape)
-        if np.isnan(attribution).any():
-            warnings.warn('Attributions generated by Occlusion method contain nans, '
-                          'probably because window_shape and step do not allow to cover the all input.')
-        return attribution
+    if step is not None:
+      assert isinstance(step, int) or len(step) == len(input_shape), \
+          'step must be integer or tuple with the length of input (%d)' % len(input_shape)
+      self.step = step
+    else:
+      self.step = 1
+    self.replace_value = 0.0
+    logging.info('Input shape: %s; window_shape %s; step %s' %
+                 (input_shape, self.window_shape, self.step))
+
+  def run(self, xs, ys=None, batch_size=None):
+    self._check_input_compatibility(xs, ys, batch_size)
+    input_shape = xs.shape[1:]
+    batch_size = xs.shape[0]
+    total_dim = np.asscalar(np.prod(input_shape))
+
+    # Create mask
+    index_matrix = np.arange(total_dim).reshape(input_shape)
+    idx_patches = view_as_windows(index_matrix, self.window_shape,
+                                  self.step).reshape((-1,) + self.window_shape)
+    heatmap = np.zeros_like(xs, dtype=np.float32).reshape((-1), total_dim)
+    w = np.zeros_like(heatmap)
+
+    # Compute original output
+    eval0 = self._session_run(self.T, xs, ys, batch_size)
+
+    # Start perturbation loop
+    for i, p in enumerate(idx_patches):
+      mask = np.ones(input_shape).flatten()
+      mask[p.flatten()] = self.replace_value
+      masked_xs = mask.reshape((1,) + input_shape) * xs
+      delta = eval0 - self._session_run(self.T, masked_xs, ys, batch_size)
+      delta_aggregated = np.sum(
+          delta.reshape((batch_size, -1)), -1, keepdims=True)
+      heatmap[:, p.flatten()] += delta_aggregated
+      w[:, p.flatten()] += p.size
+
+    attribution = np.reshape(heatmap / w, xs.shape)
+    if np.isnan(attribution).any():
+      warnings.warn(
+          'Attributions generated by Occlusion method contain nans, '
+          'probably because window_shape and step do not allow to cover the all input.'
+      )
+    return attribution
 
 
 """
@@ -460,7 +516,7 @@ def run(self, xs, ys=None, batch_size=None):
 Computes approximate Shapley Values using "Polynomial calculation of the Shapley value based on sampling",
 Castro et al, 2009 (https://www.sciencedirect.com/science/article/pii/S0305054808000804)
 samples : integer (default 5)
-Defined the number of samples for each input feature. 
+Defined the number of samples for each input feature.
 Notice that evaluating a model samples * n_input_feature times might take a while.
 sampling_dims : list of dimension indexes to run sampling on (feature dimensions).
 By default, all dimensions except the batch dimension will be sampled.
@@ -471,61 +527,72 @@ def run(self, xs, ys=None, batch_size=None):
 
 class ShapleySampling(PerturbationBasedMethod):
 
-    def __init__(self, T, X, session, keras_learning_phase, samples=5, sampling_dims=None):
-        super(ShapleySampling, self).__init__(T, X, session, keras_learning_phase)
-        if self.has_multiple_inputs:
-            raise RuntimeError('Multiple inputs not yet supported for perturbation methods')
-        dims = len(X.shape)
-        if sampling_dims is not None:
-            if not 0 < len(sampling_dims) <= (dims - 1):
-                raise RuntimeError('sampling_dims must be a list containing 1 to %d elements' % (dims-1))
-            if 0 in sampling_dims:
-                raise RuntimeError('Cannot sample batch dimension: remove 0 from sampling_dims')
-            if any([x < 1 or x > dims-1 for x in sampling_dims]):
-                raise RuntimeError('Invalid value in sampling_dims')
-        else:
-            sampling_dims = list(range(1, dims))
-
-        self.samples = samples
-        self.sampling_dims = sampling_dims
-
-    def run(self, xs, ys=None, batch_size=None):
-        xs_shape = list(xs.shape)
-        batch_size = xs.shape[0]
-        n_features = int(np.asscalar(np.prod([xs.shape[i] for i in self.sampling_dims])))
-        result = np.zeros((xs_shape[0], n_features))
-
-        run_shape = list(xs_shape)  # a copy
-        run_shape = np.delete(run_shape, self.sampling_dims).tolist()
-        run_shape.insert(1, -1)
-
-        reconstruction_shape = [xs_shape[0]]
-        for j in self.sampling_dims:
-            reconstruction_shape.append(xs_shape[j])
-
-        for r in range(self.samples):
-            p = np.random.permutation(n_features)
-            x = xs.copy().reshape(run_shape)
-            y = None
-            for i in p:
-                if y is None:
-                    y = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
-                x[:, i] = 0
-                y0 = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
-                delta = y - y0
-                delta_aggregated = np.sum(delta.reshape((batch_size, -1)), -1, keepdims=False)
-                result[:, i] += delta_aggregated
-                y = y0
-
-        shapley = result / self.samples
-        return shapley.reshape(reconstruction_shape)
+  def __init__(self,
+               T,
+               X,
+               session,
+               keras_learning_phase,
+               samples=5,
+               sampling_dims=None):
+    super(ShapleySampling, self).__init__(T, X, session, keras_learning_phase)
+    if self.has_multiple_inputs:
+      raise RuntimeError(
+          'Multiple inputs not yet supported for perturbation methods')
+    dims = len(X.shape)
+    if sampling_dims is not None:
+      if not 0 < len(sampling_dims) <= (dims - 1):
+        raise RuntimeError(
+            'sampling_dims must be a list containing 1 to %d elements' %
+            (dims - 1))
+      if 0 in sampling_dims:
+        raise RuntimeError(
+            'Cannot sample batch dimension: remove 0 from sampling_dims')
+      if any([x < 1 or x > dims - 1 for x in sampling_dims]):
+        raise RuntimeError('Invalid value in sampling_dims')
+    else:
+      sampling_dims = list(range(1, dims))
+
+    self.samples = samples
+    self.sampling_dims = sampling_dims
+
+  def run(self, xs, ys=None, batch_size=None):
+    xs_shape = list(xs.shape)
+    batch_size = xs.shape[0]
+    n_features = int(
+        np.asscalar(np.prod([xs.shape[i] for i in self.sampling_dims])))
+    result = np.zeros((xs_shape[0], n_features))
+
+    run_shape = list(xs_shape)  # a copy
+    run_shape = np.delete(run_shape, self.sampling_dims).tolist()
+    run_shape.insert(1, -1)
+
+    reconstruction_shape = [xs_shape[0]]
+    for j in self.sampling_dims:
+      reconstruction_shape.append(xs_shape[j])
+
+    for r in range(self.samples):
+      p = np.random.permutation(n_features)
+      x = xs.copy().reshape(run_shape)
+      y = None
+      for i in p:
+        if y is None:
+          y = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
+        x[:, i] = 0
+        y0 = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
+        delta = y - y0
+        delta_aggregated = np.sum(
+            delta.reshape((batch_size, -1)), -1, keepdims=False)
+        result[:, i] += delta_aggregated
+        y = y0
+
+    shapley = result / self.samples
+    return shapley.reshape(reconstruction_shape)
 
 
 # -----------------------------------------------------------------------------
 # END ATTRIBUTION METHODS
 # -----------------------------------------------------------------------------
 
-
 attribution_methods = OrderedDict({
     'zero': (DummyZero, 0),
     'saliency': (Saliency, 1),
@@ -538,104 +605,117 @@ def run(self, xs, ys=None, batch_size=None):
 })
 
 
-
-@ops.RegisterGradient("DeepExplainGrad")
+@ops.RegisterGradient('DeepExplainGrad')
 def deepexplain_grad(op, grad):
-    global _ENABLED_METHOD_CLASS, _GRAD_OVERRIDE_CHECKFLAG
-    _GRAD_OVERRIDE_CHECKFLAG = 1
-    if _ENABLED_METHOD_CLASS is not None \
-            and issubclass(_ENABLED_METHOD_CLASS, GradientBasedMethod):
-        return _ENABLED_METHOD_CLASS.nonlinearity_grad_override(op, grad)
-    else:
-        return original_grad(op, grad)
+  global _ENABLED_METHOD_CLASS, _GRAD_OVERRIDE_CHECKFLAG
+  _GRAD_OVERRIDE_CHECKFLAG = 1
+  if _ENABLED_METHOD_CLASS is not None \
+          and issubclass(_ENABLED_METHOD_CLASS, GradientBasedMethod):
+    return _ENABLED_METHOD_CLASS.nonlinearity_grad_override(op, grad)
+  else:
+    return original_grad(op, grad)
 
 
 class DeepExplain(object):
 
-    def __init__(self, graph=None, session=tf.get_default_session()):
-        self.method = None
-        self.batch_size = None
-        self.session = session
-        self.graph = session.graph if graph is None else graph
-        self.graph_context = self.graph.as_default()
-        self.override_context = self.graph.gradient_override_map(self.get_override_map())
-        self.keras_phase_placeholder = None
-        self.context_on = False
-        if self.session is None:
-            raise RuntimeError('DeepExplain: could not retrieve a session. Use DeepExplain(session=your_session).')
-
-    def __enter__(self):
-        # Override gradient of all ops created in context
-        self.graph_context.__enter__()
-        self.override_context.__enter__()
-        self.context_on = True
-        return self
-
-    def __exit__(self, type, value, traceback):
-        self.graph_context.__exit__(type, value, traceback)
-        self.override_context.__exit__(type, value, traceback)
-        self.context_on = False
-
-    def get_explainer(self, method, T, X, **kwargs):
-        if not self.context_on:
-            raise RuntimeError('Explain can be called only within a DeepExplain context.')
-        global _ENABLED_METHOD_CLASS, _GRAD_OVERRIDE_CHECKFLAG
-        self.method = method
-        if self.method in attribution_methods:
-            method_class, method_flag = attribution_methods[self.method]
-        else:
-            raise RuntimeError('Method must be in %s' % list(attribution_methods.keys()))
-        if isinstance(X, list):
-            for x in X:
-                if 'tensor' not in str(type(x)).lower():
-                    raise RuntimeError('If a list, X must contain only Tensorflow Tensor objects')
-        else:
-            if 'tensor' not in str(type(X)).lower():
-                raise RuntimeError('X must be a Tensorflow Tensor object or a list of them')
-
-        if 'tensor' not in str(type(T)).lower():
-            raise RuntimeError('T must be a Tensorflow Tensor object')
-
-        logging.info('DeepExplain: running "%s" explanation method (%d)' % (self.method, method_flag))
-        self._check_ops()
-        _GRAD_OVERRIDE_CHECKFLAG = 0
-
-        _ENABLED_METHOD_CLASS = method_class
-        method = _ENABLED_METHOD_CLASS(T, X,
-                                       self.session,
-                                       keras_learning_phase=self.keras_phase_placeholder,
-                                       **kwargs)
-
-        if issubclass(_ENABLED_METHOD_CLASS, GradientBasedMethod) and _GRAD_OVERRIDE_CHECKFLAG == 0:
-            warnings.warn('DeepExplain detected you are trying to use an attribution method that requires '
-                          'gradient override but the original gradient was used instead. You might have forgot to '
-                          '(re)create your graph within the DeepExlain context. Results are not reliable!')
-        _ENABLED_METHOD_CLASS = None
-        _GRAD_OVERRIDE_CHECKFLAG = 0
-        self.keras_phase_placeholder = None
-        return method
-
-    def explain(self, method, T, X, xs, ys=None, batch_size=None, **kwargs):
-        explainer = self.get_explainer(method, T, X, **kwargs)
-        return explainer.run(xs, ys, batch_size)
-
-    @staticmethod
-    def get_override_map():
-        return dict((a, 'DeepExplainGrad') for a in SUPPORTED_ACTIVATIONS)
-
-    def _check_ops(self):
-        """
-        Heuristically check if any op is in the list of unsupported activation functions.
-        This does not cover all cases where explanation methods would fail, and must be improved in the future.
-        Also, check if the placeholder named 'keras_learning_phase' exists in the graph. This is used by Keras
-         and needs to be passed in feed_dict.
-        :return:
-        """
-        g = tf.get_default_graph()
-        for op in g.get_operations():
-            if len(op.inputs) > 0 and not op.name.startswith('gradients'):
-                if op.type in UNSUPPORTED_ACTIVATIONS:
-                    warnings.warn('Detected unsupported activation (%s). '
-                                  'This might lead to unexpected or wrong results.' % op.type)
-            elif 'keras_learning_phase' in op.name:
-                self.keras_phase_placeholder = op.outputs[0]
\ No newline at end of file
+  def __init__(self, graph=None, session=tf.get_default_session()):
+    self.method = None
+    self.batch_size = None
+    self.session = session
+    self.graph = session.graph if graph is None else graph
+    self.graph_context = self.graph.as_default()
+    self.override_context = self.graph.gradient_override_map(
+        self.get_override_map())
+    self.keras_phase_placeholder = None
+    self.context_on = False
+    if self.session is None:
+      raise RuntimeError(
+          'DeepExplain: could not retrieve a session. Use DeepExplain(session=your_session).'
+      )
+
+  def __enter__(self):
+    # Override gradient of all ops created in context
+    self.graph_context.__enter__()
+    self.override_context.__enter__()
+    self.context_on = True
+    return self
+
+  def __exit__(self, type, value, traceback):
+    self.graph_context.__exit__(type, value, traceback)
+    self.override_context.__exit__(type, value, traceback)
+    self.context_on = False
+
+  def get_explainer(self, method, T, X, **kwargs):
+    if not self.context_on:
+      raise RuntimeError(
+          'Explain can be called only within a DeepExplain context.')
+    global _ENABLED_METHOD_CLASS, _GRAD_OVERRIDE_CHECKFLAG
+    self.method = method
+    if self.method in attribution_methods:
+      method_class, method_flag = attribution_methods[self.method]
+    else:
+      raise RuntimeError('Method must be in %s' %
+                         list(attribution_methods.keys()))
+    if isinstance(X, list):
+      for x in X:
+        if 'tensor' not in str(type(x)).lower():
+          raise RuntimeError(
+              'If a list, X must contain only Tensorflow Tensor objects')
+    else:
+      if 'tensor' not in str(type(X)).lower():
+        raise RuntimeError(
+            'X must be a Tensorflow Tensor object or a list of them')
+
+    if 'tensor' not in str(type(T)).lower():
+      raise RuntimeError('T must be a Tensorflow Tensor object')
+
+    logging.info('DeepExplain: running "%s" explanation method (%d)' %
+                 (self.method, method_flag))
+    self._check_ops()
+    _GRAD_OVERRIDE_CHECKFLAG = 0
+
+    _ENABLED_METHOD_CLASS = method_class
+    method = _ENABLED_METHOD_CLASS(
+        T,
+        X,
+        self.session,
+        keras_learning_phase=self.keras_phase_placeholder,
+        **kwargs)
+
+    if issubclass(_ENABLED_METHOD_CLASS,
+                  GradientBasedMethod) and _GRAD_OVERRIDE_CHECKFLAG == 0:
+      warnings.warn(
+          'DeepExplain detected you are trying to use an attribution method that requires '
+          'gradient override but the original gradient was used instead. You might have forgot to '
+          '(re)create your graph within the DeepExlain context. Results are not reliable!'
+      )
+    _ENABLED_METHOD_CLASS = None
+    _GRAD_OVERRIDE_CHECKFLAG = 0
+    self.keras_phase_placeholder = None
+    return method
+
+  def explain(self, method, T, X, xs, ys=None, batch_size=None, **kwargs):
+    explainer = self.get_explainer(method, T, X, **kwargs)
+    return explainer.run(xs, ys, batch_size)
+
+  @staticmethod
+  def get_override_map():
+    return dict((a, 'DeepExplainGrad') for a in SUPPORTED_ACTIVATIONS)
+
+  def _check_ops(self):
+    """Heuristically check if any op is in the list of unsupported activation functions.
+
+    This does not cover all cases where explanation methods would fail, and must be improved in the future.
+    Also, check if the placeholder named 'keras_learning_phase' exists in the graph. This is used by Keras
+     and needs to be passed in feed_dict.
+    :return:
+    """
+    g = tf.get_default_graph()
+    for op in g.get_operations():
+      if len(op.inputs) > 0 and not op.name.startswith('gradients'):
+        if op.type in UNSUPPORTED_ACTIVATIONS:
+          warnings.warn('Detected unsupported activation (%s). '
+                        'This might lead to unexpected or wrong results.' %
+                        op.type)
+      elif 'keras_learning_phase' in op.name:
+        self.keras_phase_placeholder = op.outputs[0]
diff --git a/easy_rec/python/tools/explainer/utils.py b/easy_rec/python/tools/explainer/utils.py
index b697bf230..574d067a8 100644
--- a/easy_rec/python/tools/explainer/utils.py
+++ b/easy_rec/python/tools/explainer/utils.py
@@ -7,63 +7,64 @@
 
 
 def make_batches(size, batch_size):
-    """Returns a list of batch indices (tuples of indices).
-    # Arguments
-        size: Integer, total size of the data to slice into batches.
-        batch_size: Integer, batch size.
-    # Returns
-        A list of tuples of array indices.
-    """
-    num_batches = (size + batch_size - 1) // batch_size  # round up
-    return [(i * batch_size, min(size, (i + 1) * batch_size))
-            for i in range(num_batches)]
+  """Returns a list of batch indices (tuples of indices).
+
+  # Arguments
+      size: Integer, total size of the data to slice into batches.
+      batch_size: Integer, batch size.
+  # Returns
+      A list of tuples of array indices.
+  """
+  num_batches = (size + batch_size - 1) // batch_size  # round up
+  return [(i * batch_size, min(size, (i + 1) * batch_size))
+          for i in range(num_batches)]
 
 
 def to_list(x, allow_tuple=False):
-    """Normalizes a list/tensor into a list.
-    If a tensor is passed, we return
-    a list of size 1 containing the tensor.
-    # Arguments
-        x: target object to be normalized.
-        allow_tuple: If False and x is a tuple,
-            it will be converted into a list
-            with a single element (the tuple).
-            Else converts the tuple to a list.
-    # Returns
-        A list.
-    """
-    if isinstance(x, list):
-        return x
-    if allow_tuple and isinstance(x, tuple):
-        return list(x)
-    return [x]
+  """Normalizes a list/tensor into a list. If a tensor is passed, we return a list of size 1 containing the tensor.
+
+  # Arguments
+      x: target object to be normalized.
+      allow_tuple: If False and x is a tuple,
+          it will be converted into a list
+          with a single element (the tuple).
+          Else converts the tuple to a list.
+  # Returns
+      A list.
+  """
+  if isinstance(x, list):
+    return x
+  if allow_tuple and isinstance(x, tuple):
+    return list(x)
+  return [x]
 
 
 def unpack_singleton(x):
-    """Gets the equivalent np-array if the iterable has only one value.
-    Otherwise return the iterable.
-    # Argument
-        x: A list or tuple.
-    # Returns
-        The same iterable or the iterable converted to a np-array.
-    """
-    if len(x) == 1:
-        return np.array(x)
-    return x
+  """Gets the equivalent np-array if the iterable has only one value. Otherwise return the iterable.
+
+  # Argument
+      x: A list or tuple.
+  # Returns
+      The same iterable or the iterable converted to a np-array.
+  """
+  if len(x) == 1:
+    return np.array(x)
+  return x
 
 
 def slice_arrays(arrays, start=None, stop=None):
-    """Slices an array or list of arrays.
-    """
-    if arrays is None:
-        return [None]
-    elif isinstance(arrays, list):
-        return [None if x is None else x[start:stop] for x in arrays]
-    else:
-        return arrays[start:stop]
+  """Slices an array or list of arrays."""
+  if arrays is None:
+    return [None]
+  elif isinstance(arrays, list):
+    return [None if x is None else x[start:stop] for x in arrays]
+  else:
+    return arrays[start:stop]
 
 
 def placeholder_from_data(numpy_array):
-    if numpy_array is None:
-        return None
-    return tf.placeholder('float', [None,] + list(numpy_array.shape[1:]))
+  if numpy_array is None:
+    return None
+  return tf.placeholder('float', [
+      None,
+  ] + list(numpy_array.shape[1:]))
diff --git a/easy_rec/python/utils/activation.py b/easy_rec/python/utils/activation.py
index f52a012ae..89044f7a3 100644
--- a/easy_rec/python/utils/activation.py
+++ b/easy_rec/python/utils/activation.py
@@ -57,7 +57,7 @@ def gelu(x, name='gelu'):
   """
   with tf.name_scope(name):
     cdf = 0.5 * (1.0 + tf.tanh(
-      (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf
 
 
diff --git a/setup.cfg b/setup.cfg
index b180b9fb1..82650a70f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ multi_line_output = 7
 force_single_line = true
 known_standard_library = setuptools
 known_first_party = easy_rec
-known_third_party = absl,common_io,docutils,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
+known_third_party = absl,common_io,docutils,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,skimage,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
 no_lines_before = LOCALFOLDER
 default_section = THIRDPARTY
 skip = easy_rec/python/protos

From 8509174b4346c5c1b4e87fc5d6799272a28d29ff Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 8 May 2023 20:08:32 +0800
Subject: [PATCH 23/54] [feat]: add const feature column

---
 easy_rec/python/feature_column/feature_column.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/easy_rec/python/feature_column/feature_column.py b/easy_rec/python/feature_column/feature_column.py
index 8f4a88913..1f62faef1 100644
--- a/easy_rec/python/feature_column/feature_column.py
+++ b/easy_rec/python/feature_column/feature_column.py
@@ -423,8 +423,11 @@ def parse_const_feature(self, config):
     """
     feature_name = config.feature_name if config.HasField('feature_name') \
         else config.input_names[0]
+    dim = config.raw_input_dim
+    if config.HasField('embedding_dim'):
+      dim = config.embedding_dim
     fc = feature_column.constant_numeric_column(
-        feature_name, shape=(config.embedding_dim,), feature_name=feature_name)
+        feature_name, shape=(dim,), feature_name=feature_name)
     if self.is_wide(config):
       self._wide_columns[feature_name] = fc
     if self.is_deep(config):

From eba4219c82d784cb48e32aa17d34ab3ed6d4a366 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 9 May 2023 14:56:30 +0800
Subject: [PATCH 24/54] [feat]: add feature selection tool

---
 easy_rec/python/compat/sort_ops.py         | 217 +++++++++++++++++++++
 easy_rec/python/input/input.py             |  10 +-
 easy_rec/python/layers/fscd_layer.py       |  66 ++++---
 easy_rec/python/tools/feature_selection.py |  93 +++++++++
 easy_rec/python/utils/tf_utils.py          |  13 ++
 5 files changed, 363 insertions(+), 36 deletions(-)
 create mode 100644 easy_rec/python/compat/sort_ops.py

diff --git a/easy_rec/python/compat/sort_ops.py b/easy_rec/python/compat/sort_ops.py
new file mode 100644
index 000000000..f7c5bf3a5
--- /dev/null
+++ b/easy_rec/python/compat/sort_ops.py
@@ -0,0 +1,217 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for sorting tensors.
+
+@@argsort
+@@sort
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops as framework_ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('sort')
+def sort(values, axis=-1, direction='ASCENDING', name=None):
+  """Sorts a tensor.
+
+  Usage:
+
+  ```python
+  import tensorflow as tf
+  a = [1, 10, 26.9, 2.8, 166.32, 62.3]
+  b = tf.sort(a,axis=-1,direction='ASCENDING',name=None)
+  c = tf.keras.backend.eval(b)
+  # Here, c = [  1.     2.8   10.    26.9   62.3  166.32]
+  ```
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+      axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+      `'DESCENDING'`).
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same dtype and shape as `values`, with the elements
+        sorted along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  with framework_ops.name_scope(name, 'sort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=False)
+
+
+@tf_export('argsort')
+def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
+  """Returns the indices of a tensor that give its sorted order along an axis.
+
+  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
+  `tf.sort(values)`. For higher dimensions, the output has the same shape as
+  `values`, but along the given axis, values represent the index of the sorted
+  element in that slice of the tensor at the given position.
+
+  Usage:
+
+  ```python
+  import tensorflow as tf
+  a = [1, 10, 26.9, 2.8, 166.32, 62.3]
+  b = tf.argsort(a,axis=-1,direction='ASCENDING',stable=False,name=None)
+  c = tf.keras.backend.eval(b)
+  # Here, c = [0 3 1 2 5 4]
+  ```
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+      axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+      `'DESCENDING'`).
+    stable: If True, equal elements in the original tensor will not be
+      re-ordered in the returned order. Unstable sort is not yet implemented,
+      but will eventually be the default for performance reasons. If you require
+      a stable order, pass `stable=True` for forwards compatibility.
+    name: Optional name for the operation.
+
+  Returns:
+    An int32 `Tensor` with the same shape as `values`. The indices that would
+        sort each slice of the given `values` along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  del stable  # Unused.
+  with framework_ops.name_scope(name, 'argsort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=True)
+
+
+def _sort_or_argsort(values, axis, direction, return_argsort):
+  """Internal sort/argsort implementation.
+
+  Args:
+    values: The input values.
+    axis: The axis along which to sort.
+    direction: 'ASCENDING' or 'DESCENDING'.
+    return_argsort: Whether to return the argsort result.
+
+  Returns:
+    Either the sorted values, or the indices of the sorted values in the
+        original tensor. See the `sort` and `argsort` docstrings.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  if direction not in _SORT_IMPL:
+    raise ValueError('%s should be one of %s' % (direction, ', '.join(
+      sorted(_SORT_IMPL.keys()))))
+  # Axis must be an integer, not a Tensor.
+  axis = framework_ops.convert_to_tensor(axis, name='axis')
+  axis_static = tensor_util.constant_value(axis)
+  if axis.shape.ndims != 0 or axis_static is None:
+    raise ValueError('axis must be a constant scalar')
+  axis_static = int(axis_static)  # Avoids NumPy casting error
+
+  values = framework_ops.convert_to_tensor(values, name='values')
+
+  return _SORT_IMPL[direction](values, axis_static, return_argsort)
+
+
+def _descending_sort(values, axis, return_argsort=False):
+  """Sorts values in reverse using `top_k`.
+
+  Args:
+    values: Tensor of numeric values.
+    axis: Index of the axis which values should be sorted along.
+    return_argsort: If False, return the sorted values. If True, return the
+      indices that would sort the values.
+
+  Returns:
+    The sorted values.
+  """
+  k = array_ops.shape(values)[axis]
+  rank = array_ops.rank(values)
+  static_rank = values.shape.ndims
+  # Fast path: sorting the last axis.
+  if axis == -1 or axis + 1 == values.get_shape().ndims:
+    top_k_input = values
+    transposition = None
+  else:
+    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
+    if axis < 0:
+      # Calculate the actual axis index if counting from the end. Use the static
+      # rank if available, or else make the axis back into a tensor.
+      axis += static_rank or rank
+    if static_rank is not None:
+      # Prefer to calculate the transposition array in NumPy and make it a
+      # constant.
+      transposition = constant_op.constant(
+        np.r_[
+          # Axes up to axis are unchanged.
+          np.arange(axis),
+          # Swap axis and rank - 1.
+          [static_rank - 1],
+          # Axes in [axis + 1, rank - 1) are unchanged.
+          np.arange(axis + 1, static_rank - 1),
+          # Swap axis and rank - 1.
+          [axis]],
+        name='transposition')
+    else:
+      # Generate the transposition array from the tensors.
+      transposition = array_ops.concat(
+        [
+          # Axes up to axis are unchanged.
+          math_ops.range(axis),
+          # Swap axis and rank - 1.
+          [rank - 1],
+          # Axes in [axis + 1, rank - 1) are unchanged.
+          math_ops.range(axis + 1, rank - 1),
+          # Swap axis and rank - 1.
+          [axis]
+        ],
+        axis=0)
+    top_k_input = array_ops.transpose(values, transposition)
+
+  values, indices = nn_ops.top_k(top_k_input, k)
+  return_value = indices if return_argsort else values
+  if transposition is not None:
+    # transposition contains a single cycle of length 2 (swapping 2 elements),
+    # so it is an involution (it is its own inverse).
+    return_value = array_ops.transpose(return_value, transposition)
+  return return_value
+
+
+def _ascending_sort(values, axis, return_argsort=False):
+  # Negate the values to get the ascending order from descending sort.
+  values_or_indices = _descending_sort(-values, axis, return_argsort)
+  # If not argsort, negate the values again.
+  return values_or_indices if return_argsort else -values_or_indices
+
+
+_SORT_IMPL = {
+    'ASCENDING': _ascending_sort,
+    'DESCENDING': _descending_sort,
+}
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index d4a990c35..686355ac0 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -18,6 +18,7 @@
 from easy_rec.python.utils.input_utils import get_type_defaults
 from easy_rec.python.utils.load_class import get_register_class_meta
 from easy_rec.python.utils.load_class import load_by_path
+from easy_rec.python.utils.tf_utils import get_config_type
 from easy_rec.python.utils.tf_utils import get_tf_type
 
 if tf.__version__ >= '2.0':
@@ -280,8 +281,9 @@ def create_multi_placeholders(self, export_config):
         logging.info('multi value input_name: %s, dtype: %s' %
                      (input_name, tf_type))
         if input_name in erase_features:
-          def_val = self.get_type_defaults(tf_type, self._input_field_defaults[fid])
-          finput = tf.placeholder_with_default(def_val, [None, None], name=placeholder_name)
+          conf_type = get_config_type(tf_type)
+          def_val = self.get_type_defaults(conf_type, self._input_field_defaults[fid])
+          finput = tf.placeholder_with_default([def_val], [None, None], name=placeholder_name)
         else:
           finput = tf.placeholder(tf_type, [None, None], name=placeholder_name)
       else:
@@ -289,8 +291,8 @@ def create_multi_placeholders(self, export_config):
         tf_type = get_tf_type(ftype)
         logging.info('input_name: %s, dtype: %s' % (input_name, tf_type))
         if input_name in erase_features:
-          def_val = self.get_type_defaults(tf_type, self._input_field_defaults[fid])
-          finput = tf.placeholder_with_default(def_val, [None], name=placeholder_name)
+          def_val = self.get_type_defaults(ftype, self._input_field_defaults[fid])
+          finput = tf.placeholder_with_default([def_val], [None], name=placeholder_name)
         else:
           finput = tf.placeholder(tf_type, [None], name=placeholder_name)
       inputs[input_name] = finput
diff --git a/easy_rec/python/layers/fscd_layer.py b/easy_rec/python/layers/fscd_layer.py
index 78849f162..a99e8aa4b 100644
--- a/easy_rec/python/layers/fscd_layer.py
+++ b/easy_rec/python/layers/fscd_layer.py
@@ -1,14 +1,16 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+from collections import OrderedDict
 import math
 import json
 import numpy as np
-import six
 import tensorflow as tf
 from tensorflow.python.framework.meta_graph import read_meta_graph_file
 from easy_rec.python.compat.feature_column.feature_column import _SharedEmbeddingColumn  # NOQA
 from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn  # NOQA
 from easy_rec.python.compat.feature_column.feature_column_v2 import SharedEmbeddingColumn  # NOQA
+from easy_rec.python.compat.sort_ops import argsort
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -28,7 +30,7 @@ def sigmoid(x):
   return 1. / (1. + math.exp(-x))
 
 
-def get_top_and_bottom_features(pipeline_config, top_k):
+def get_feature_importance(pipeline_config, feature_group_name=None):
   assert pipeline_config.model_config.HasField(
     'variational_dropout'), 'variational_dropout must be in model_config'
 
@@ -41,29 +43,50 @@ def get_top_and_bottom_features(pipeline_config, top_k):
     features = json.loads(col_def)
     features_map.update(features)
 
-  top_features = set()
+  feature_importance = OrderedDict()
   tf.logging.info('Reading checkpoint from %s ...' % checkpoint_path)
   reader = tf.train.NewCheckpointReader(checkpoint_path)
   for feature_group in pipeline_config.model_config.feature_groups:
     group_name = feature_group.group_name
-    delta_name = 'fscd_delta_%s' % group_name
-    if not reader.has_tensor(delta_name):
+    if feature_group_name is not None and feature_group_name != group_name:
       continue
     assert group_name in features_map, "%s not in feature map" % group_name
     feature_dims = features_map[group_name]
+
+    delta_name = 'fscd_delta_%s' % group_name
+    if not reader.has_tensor(delta_name):
+      logging.warn("feature group `%s` doesn't be involved in FSCD layer")
+      for feature, dim in feature_dims:
+        feature_importance[feature] = 1.0
+      continue
+
     delta = reader.get_tensor(delta_name)
-    values, indices = tf.nn.top_k(delta, top_k)
+    indices = argsort(delta, direction='DESCENDING')
+    keep_prob = tf.nn.sigmoid(delta)
     with tf.Session() as sess:
       idx = indices.eval(session=sess)
+      probs = keep_prob.eval(session=sess)
     for i in idx:
       feature = feature_dims[i][0]
-      top_features.add(feature)
+      if feature in feature_importance:
+        raw = feature_importance[feature]
+        if probs[i] > raw:
+          logging.info("%s importance change from %d to %d", feature, raw, probs[i])
+          feature_importance[feature] = probs[i]
+      else:
+        feature_importance[feature] = probs[i]
+  return feature_importance
 
+
+def get_top_and_bottom_features(pipeline_config, top_k):
+  feature_score = get_feature_importance(pipeline_config)
+  top_features = set()
   bottom_features = set()
-  for group_name, features in six.iteritems(features_map):
-    for name, dim in features:
-      if name not in top_features:
-        bottom_features.add(name)
+  for feature, score in feature_score.iteritems():
+    if len(top_features) < top_k:
+      top_features.add(feature)
+    else:
+      bottom_features.add(feature)
 
   print("selected top %d features:" % top_k, ','.join(top_features))
   print("removed bottom features:", ','.join(bottom_features))
@@ -127,31 +150,10 @@ def compute_regular_params(self, cols_to_feature):
             "dimension:", dim, "c:", c, "theta:", theta, "alpha:", alpha)
     return alphas
 
-  # def mask_bottom_features(self, cols_to_feature, top_k):
-  #   feature_map = tf.get_collection('variational_dropout')
-  #   features = feature_map[self.name]
-  #
-  #   delta_name = 'fscd_delta_%s' % self.name
-  #   graph = tf.get_default_graph()
-  #   delta = graph.get_tensor_by_name(delta_name)
-  #   values, indices = tf.nn.top_k(delta, top_k)
-  #
-  #   output_tensors = []
-  #   feature_columns = cols_to_feature.keys()
-  #   for column in sorted(feature_columns, key=lambda x: x.name):
-  #     value = cols_to_feature[column]
-  #     output_tensors.append(value)
-  #   return tf.concat(output_tensors, 1)
-
   def __call__(self, cols_to_feature):
     """
     cols_to_feature: an ordered dict mapping feature_column to feature_values
     """
-    # if self._config.HasField('fine_tune_use_top_k_features'):
-    #   k = self._config.fine_tune_use_top_k_features
-    #   assert k > 0, 'config `fine_tune_use_top_k_features` must be large than 0'
-    #   return self.mask_bottom_features(cols_to_feature, k)
-
     feature_dimension = []
     output_tensors = []
     alphas = []
diff --git a/easy_rec/python/tools/feature_selection.py b/easy_rec/python/tools/feature_selection.py
index 05b193897..cbe717351 100644
--- a/easy_rec/python/tools/feature_selection.py
+++ b/easy_rec/python/tools/feature_selection.py
@@ -294,6 +294,90 @@ def _visualize_feature_importance(self, feature_importance, group_name):
       plt.savefig(f, format='png')
 
 
+class FSCD(object):
+  def __init__(self,
+               config_path,
+               output_dir,
+               topk,
+               checkpoint_path=None,
+               fg_path=None,
+               visualize=False):
+    self._config_path = config_path
+    self._output_dir = output_dir
+    self._topk = topk
+    if not tf.gfile.Exists(self._output_dir):
+      tf.gfile.MakeDirs(self._output_dir)
+    self._checkpoint_path = checkpoint_path
+    self._fg_path = fg_path
+    self._visualize = visualize
+
+  def process(self):
+    tf.logging.info('Loading delta of FSCD layer ...')
+    config = config_util.get_configs_from_pipeline_file(self._config_path)
+    assert config.model_config.HasField(
+        'variational_dropout'), 'variational_dropout must be in model_config'
+
+    feature_importance_map = {}
+    from easy_rec.python.layers.fscd_layer import get_feature_importance
+    for feature_group in config.model_config.feature_groups:
+      group_name = feature_group.group_name
+      tf.logging.info('Calculating %s feature importance ...' % group_name)
+      feature_importance = get_feature_importance(config, group_name)
+      feature_importance_map[group_name] = feature_importance
+
+      tf.logging.info('Dump %s  feature importance to csv ...' % group_name)
+      self._dump_to_csv(feature_importance, group_name)
+
+      if self._visualize:
+        tf.logging.info('Visualizing %s feature importance ...' % group_name)
+        self._visualize_feature_importance(feature_importance, group_name)
+
+    tf.logging.info('Processing model config ...')
+    self._process_config(feature_importance_map)
+
+  def _dump_to_csv(self, feature_importance, group_name):
+    """Dump feature importance data to a csv file."""
+    with tf.gfile.Open(
+        os.path.join(self._output_dir,
+                     'feature_importance_%s.csv' % group_name), 'w') as f:
+      df = pd.DataFrame(
+          columns=['feature_name', 'importance'],
+          data=[list(kv) for kv in feature_importance.items()])
+      df.to_csv(f, encoding='gbk')
+
+  def _visualize_feature_importance(self, feature_importance, group_name):
+    """Draw feature importance histogram."""
+    df = pd.DataFrame(
+        columns=['feature_name', 'importance'],
+        data=[list(kv) for kv in feature_importance.items()])
+    df['color'] = ['red' if x < 0.5 else 'green' for x in df['importance']]
+    df.sort_values('importance', inplace=True, ascending=False)
+    df.reset_index(inplace=True)
+    # Draw plot
+    plt.figure(figsize=(90, 200), dpi=100)
+    plt.hlines(y=df.index, xmin=0, xmax=df.mean_drop_p)
+    for x, y, tex in zip(df.mean_drop_p, df.index, df.mean_drop_p):
+      plt.text(
+          x,
+          y,
+          round(tex, 2),
+          horizontalalignment='right' if x < 0 else 'left',
+          verticalalignment='center',
+          fontdict={
+              'color': 'red' if x < 0 else 'green',
+              'size': 14
+          })
+    # Decorations
+    plt.yticks(df.index, df.feature_name, fontsize=20)
+    plt.title('Feature Importance', fontdict={'size': 30})
+    plt.grid(linestyle='--', alpha=0.5)
+    plt.xlim(0, 1)
+    with tf.gfile.GFile(
+        os.path.join(self._output_dir,
+                     'feature_importance_pic_%s.png' % group_name), 'wb') as f:
+      plt.savefig(f, format='png')
+
+
 if __name__ == '__main__':
   if FLAGS.model_type == 'variational_dropout':
     fs = VariationalDropoutFS(
@@ -304,6 +388,15 @@ def _visualize_feature_importance(self, feature_importance, group_name):
         fg_path=FLAGS.fg_path,
         visualize=FLAGS.visualize)
     fs.process()
+  elif FLAGS.model_type == 'fscd':
+    fs = FSCD(
+      FLAGS.config_path,
+      FLAGS.output_dir,
+      FLAGS.topk,
+      checkpoint_path=FLAGS.checkpoint_path,
+      fg_path=FLAGS.fg_path,
+      visualize=FLAGS.visualize)
+    fs.process()
   else:
     raise ValueError('Unknown feature selection model type %s' %
                      FLAGS.model_type)
diff --git a/easy_rec/python/utils/tf_utils.py b/easy_rec/python/utils/tf_utils.py
index 20e19496c..e1026c132 100644
--- a/easy_rec/python/utils/tf_utils.py
+++ b/easy_rec/python/utils/tf_utils.py
@@ -33,3 +33,16 @@ def get_col_type(tf_type):
   }
   assert tf_type in type_map, 'invalid type: %s' % tf_type
   return type_map[tf_type]
+
+
+def get_config_type(tf_type):
+  type_map = {
+      tf.int32: DatasetConfig.INT32,
+      tf.int64: DatasetConfig.INT64,
+      tf.string: DatasetConfig.STRING,
+      tf.bool: DatasetConfig.BOOL,
+      tf.float32: DatasetConfig.FLOAT,
+      tf.double: DatasetConfig.DOUBLE
+  }
+  assert tf_type in type_map, 'invalid type: %s' % tf_type
+  return type_map[tf_type]

From c27c1d88bc7adaf1ddc8239454d5c6fc5bdf61ff Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Thu, 11 May 2023 20:12:41 +0800
Subject: [PATCH 25/54] [feat]: add feature selection tool

---
 easy_rec/python/builders/loss_builder.py      | 11 ++-
 .../feature_column/feature_column_v2.py       | 31 ++-----
 easy_rec/python/compat/sort_ops.py            | 47 +++++-----
 easy_rec/python/inference/predictor.py        |  3 +
 easy_rec/python/input/input.py                | 57 ++++++++----
 easy_rec/python/layers/fscd_layer.py          | 65 +++++++------
 easy_rec/python/loss/jrc_loss.py              | 13 ++-
 easy_rec/python/tools/feature_selection.py    | 93 ++++++++++++++++---
 easy_rec/python/tools/view_saved_model.py     | 38 ++++++++
 setup.cfg                                     |  2 +-
 10 files changed, 255 insertions(+), 105 deletions(-)
 create mode 100644 easy_rec/python/tools/view_saved_model.py

diff --git a/easy_rec/python/builders/loss_builder.py b/easy_rec/python/builders/loss_builder.py
index 7459372a5..e1b32fde1 100644
--- a/easy_rec/python/builders/loss_builder.py
+++ b/easy_rec/python/builders/loss_builder.py
@@ -42,11 +42,16 @@ def build(loss_type,
         labels=label, predictions=pred, weights=loss_weight, **kwargs)
   elif loss_type == LossType.JRC_LOSS:
     alpha = 0.5 if loss_param is None else loss_param.alpha
-    auto_weight = False if loss_param is None else not loss_param.HasField(
-        'alpha')
+    auto = False if loss_param is None else not loss_param.HasField('alpha')
     session = kwargs.get('session_ids', None)
     return jrc_loss(
-        label, pred, session, alpha, auto_weight=auto_weight, name=loss_name)
+        label,
+        pred,
+        session,
+        alpha,
+        auto_weight=auto,
+        sample_weights=loss_weight,
+        name=loss_name)
   elif loss_type == LossType.PAIR_WISE_LOSS:
     session = kwargs.get('session_ids', None)
     margin = 0 if loss_param is None else loss_param.margin
diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py
index a17ce8fdc..578b0a50a 100644
--- a/easy_rec/python/compat/feature_column/feature_column_v2.py
+++ b/easy_rec/python/compat/feature_column/feature_column_v2.py
@@ -1329,11 +1329,10 @@ def numeric_column(key,
 
 
 def constant_numeric_column(key,
-                   shape=(1,),
-                   default_value=None,
-                   dtype=dtypes.float32,
-                   normalizer_fn=None,
-                   feature_name=None):
+                            shape=(1,),
+                            default_value=None,
+                            dtype=dtypes.float32,
+                            feature_name=None):
   """Represents real valued or numerical features.
 
   Example:
@@ -1368,12 +1367,6 @@ def constant_numeric_column(key,
       the shape of the `default_value` should be equal to the given `shape`.
     dtype: defines the type of values. Default value is `tf.float32`. Must be a
       non-quantized, real integer or floating point type.
-    normalizer_fn: If not `None`, a function that can be used to normalize the
-      value of the tensor after `default_value` is applied for parsing.
-      Normalizer function takes the input `Tensor` as its argument, and returns
-      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
-      even though the most common use case of this function is normalization, it
-      can be used for any kind of Tensorflow transformations.
 
   Returns:
     A `NumericColumn`.
@@ -1391,18 +1384,13 @@ def constant_numeric_column(key,
                      'dtype: {}, key: {}'.format(dtype, key))
   default_value = fc_utils.check_default_value(shape, default_value, dtype, key)
 
-  if normalizer_fn is not None and not callable(normalizer_fn):
-    raise TypeError(
-        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
-
   fc_utils.assert_key_is_string(key)
   return ConstantNumericColumn(
       feature_name=feature_name,
       key=key,
       shape=shape,
       default_value=default_value,
-      dtype=dtype,
-      normalizer_fn=normalizer_fn)
+      dtype=dtype)
 
 
 def bucketized_column(source_column, boundaries):
@@ -2701,7 +2689,7 @@ class ConstantNumericColumn(
     fc_old._DenseColumn,  # pylint: disable=protected-access
     collections.namedtuple('ConstantNumericColumn',
                            ('feature_name', 'key', 'shape', 'default_value',
-                            'dtype', 'normalizer_fn'))):
+                            'dtype'))):
   """see `numeric_column`."""
 
   @property
@@ -2734,8 +2722,11 @@ def _parse_example_spec(self):
     return self.parse_example_spec
 
   def _transform_input_tensor(self, input_tensor):
+    shape = [1] + list(self.shape)
     def_val = 0 if self.default_value is None else self.default_value
-    return tf.constant(def_val, dtypes.float32, self.shape)
+    row = tf.constant(def_val, dtypes.float32, shape)
+    batch_size = tf.shape(input_tensor)[0]
+    return tf.tile(row, [batch_size, 1])
 
   @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                           _FEATURE_COLUMN_DEPRECATION)
@@ -2746,8 +2737,6 @@ def _transform_feature(self, inputs):
   def transform_feature(self, transformation_cache, state_manager):
     """See `FeatureColumn` base class.
 
-    In this case, we apply the `normalizer_fn` to the input tensor.
-
     Args:
       transformation_cache: A `FeatureTransformationCache` object to access
         features.
diff --git a/easy_rec/python/compat/sort_ops.py b/easy_rec/python/compat/sort_ops.py
index f7c5bf3a5..bd7f92ab1 100644
--- a/easy_rec/python/compat/sort_ops.py
+++ b/easy_rec/python/compat/sort_ops.py
@@ -23,7 +23,6 @@
 from __future__ import print_function
 
 import numpy as np
-
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops as framework_ops
 from tensorflow.python.framework import tensor_util
@@ -126,8 +125,8 @@ def _sort_or_argsort(values, axis, direction, return_argsort):
     ValueError: If axis is not a constant scalar, or the direction is invalid.
   """
   if direction not in _SORT_IMPL:
-    raise ValueError('%s should be one of %s' % (direction, ', '.join(
-      sorted(_SORT_IMPL.keys()))))
+    raise ValueError('%s should be one of %s' %
+                     (direction, ', '.join(sorted(_SORT_IMPL.keys()))))
   # Axis must be an integer, not a Tensor.
   axis = framework_ops.convert_to_tensor(axis, name='axis')
   axis_static = tensor_util.constant_value(axis)
@@ -169,30 +168,30 @@ def _descending_sort(values, axis, return_argsort=False):
       # Prefer to calculate the transposition array in NumPy and make it a
       # constant.
       transposition = constant_op.constant(
-        np.r_[
-          # Axes up to axis are unchanged.
-          np.arange(axis),
-          # Swap axis and rank - 1.
-          [static_rank - 1],
-          # Axes in [axis + 1, rank - 1) are unchanged.
-          np.arange(axis + 1, static_rank - 1),
-          # Swap axis and rank - 1.
-          [axis]],
-        name='transposition')
+          np.r_[
+              # Axes up to axis are unchanged.
+              np.arange(axis),
+              # Swap axis and rank - 1.
+              [static_rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              np.arange(axis + 1, static_rank - 1),
+              # Swap axis and rank - 1.
+              [axis]],
+          name='transposition')
     else:
       # Generate the transposition array from the tensors.
       transposition = array_ops.concat(
-        [
-          # Axes up to axis are unchanged.
-          math_ops.range(axis),
-          # Swap axis and rank - 1.
-          [rank - 1],
-          # Axes in [axis + 1, rank - 1) are unchanged.
-          math_ops.range(axis + 1, rank - 1),
-          # Swap axis and rank - 1.
-          [axis]
-        ],
-        axis=0)
+          [
+              # Axes up to axis are unchanged.
+              math_ops.range(axis),
+              # Swap axis and rank - 1.
+              [rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              math_ops.range(axis + 1, rank - 1),
+              # Swap axis and rank - 1.
+              [axis]
+          ],
+          axis=0)
     top_k_input = array_ops.transpose(values, transposition)
 
   values, indices = nn_ops.top_k(top_k_input, k)
diff --git a/easy_rec/python/inference/predictor.py b/easy_rec/python/inference/predictor.py
index dba53f967..e39592c18 100644
--- a/easy_rec/python/inference/predictor.py
+++ b/easy_rec/python/inference/predictor.py
@@ -222,6 +222,9 @@ def _build_model(self):
               logging.info('Load input binding: %s -> %s' % (name, tensor.name))
               input_name = tensor.name
               input_name, _ = input_name.split(':')
+              input_op = self._graph.get_operation_by_name(input_name)
+              if input_op.type == "PlaceholderWithDefault":
+                continue
               try:
                 input_id = input_name.split('_')[-1]
                 input_id = int(input_id)
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index 686355ac0..2775ad1ac 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -94,12 +94,14 @@ def __init__(self,
     # from the types defined in input_fields
     # it is used in create_multi_placeholders
     self._multi_value_types = {}
-
+    self._const_features = set()
     self._normalizer_fn = {}
     for fc in self._feature_configs:
       for input_name in fc.input_names:
         assert input_name in self._input_fields, 'invalid input_name in %s' % str(
             fc)
+        if fc.feature_type == fc.ConstFeature:
+          self._const_features.add(input_name)
         if input_name not in self._effective_fields:
           self._effective_fields.append(input_name)
 
@@ -227,17 +229,17 @@ def should_stop(self, curr_epoch):
     return total_epoch is not None and curr_epoch >= total_epoch
 
   def get_erase_features(self):
-    if self._pipeline_config is None:
-      return set()
+    if len(self._const_features) == 0:
+      return self._const_features
 
-    config = self._pipeline_config.model_config.variational_dropout
-    if config is None:
-      return set()
+    for fc in self._feature_configs:
+      if fc.feature_type == fc.ConstFeature:
+        continue
+      for input_name in fc.input_names:
+        if input_name in self._const_features:
+          self._const_features.remove(input_name)
 
-    top_k = config.fine_tune_use_top_k_features
-    from easy_rec.python.layers.fscd_layer import get_top_and_bottom_features
-    _, erase_features = get_top_and_bottom_features(self._pipeline_config, top_k)
-    return erase_features
+    return self._const_features
 
   def create_multi_placeholders(self, export_config):
     """Create multiply placeholders on export, one for each feature.
@@ -282,8 +284,10 @@ def create_multi_placeholders(self, export_config):
                      (input_name, tf_type))
         if input_name in erase_features:
           conf_type = get_config_type(tf_type)
-          def_val = self.get_type_defaults(conf_type, self._input_field_defaults[fid])
-          finput = tf.placeholder_with_default([def_val], [None, None], name=placeholder_name)
+          def_val = self.get_type_defaults(conf_type,
+                                           self._input_field_defaults[fid])
+          finput = tf.placeholder_with_default([def_val], [None, None],
+                                               name=placeholder_name)
         else:
           finput = tf.placeholder(tf_type, [None, None], name=placeholder_name)
       else:
@@ -291,8 +295,10 @@ def create_multi_placeholders(self, export_config):
         tf_type = get_tf_type(ftype)
         logging.info('input_name: %s, dtype: %s' % (input_name, tf_type))
         if input_name in erase_features:
-          def_val = self.get_type_defaults(ftype, self._input_field_defaults[fid])
-          finput = tf.placeholder_with_default([def_val], [None], name=placeholder_name)
+          def_val = self.get_type_defaults(ftype,
+                                           self._input_field_defaults[fid])
+          finput = tf.placeholder_with_default([def_val], [None],
+                                               name=placeholder_name)
         else:
           finput = tf.placeholder(tf_type, [None], name=placeholder_name)
       inputs[input_name] = finput
@@ -500,10 +506,19 @@ def _parse_id_feature(self, fc, parsed_dict, field_dict):
               tf.int32,
               name='%s_str_2_int' % input_0)
 
-  def _parse_const_feature(self, fc, parsed_dict, field_dict):
+  def _parse_const_feature(self, fc, parsed_dict, field_dict, batch_size):
     input_0 = fc.input_names[0]
+    input_tensor = field_dict[input_0]
+
+    def expand_input():
+      multiples = [1] * input_tensor.shape.ndims
+      multiples[0] = batch_size
+      return tf.tile(input_tensor, multiples)
+
+    input_tensor = tf.cond(tf.equal(tf.shape(input_tensor)[0], batch_size),
+                           lambda: input_tensor, expand_input)
     feature_name = fc.feature_name if fc.HasField('feature_name') else input_0
-    parsed_dict[feature_name] = field_dict[input_0]
+    parsed_dict[feature_name] = input_tensor
 
   def _parse_raw_feature(self, fc, parsed_dict, field_dict):
     input_0 = fc.input_names[0]
@@ -795,6 +810,14 @@ def _preprocess(self, field_dict):
           parsed_dict[k] = v
           self._appended_fields.append(k)
 
+    batch_size = 1
+    for fc in self._feature_configs:
+      feature_type = fc.feature_type
+      if feature_type != fc.ConstFeature:
+        input_0 = fc.input_names[0]
+        batch_size = tf.shape(field_dict[input_0])[0]
+        break
+
     for fc in self._feature_configs:
       feature_name = fc.feature_name
       feature_type = fc.feature_type
@@ -813,7 +836,7 @@ def _preprocess(self, field_dict):
       elif feature_type == fc.ExprFeature:
         self._parse_expr_feature(fc, parsed_dict, field_dict)
       elif feature_type == fc.ConstFeature:
-        self._parse_const_feature(fc, parsed_dict, field_dict)
+        self._parse_const_feature(fc, parsed_dict, field_dict, batch_size)
       else:
         feature_name = fc.feature_name if fc.HasField(
             'feature_name') else fc.input_names[0]
diff --git a/easy_rec/python/layers/fscd_layer.py b/easy_rec/python/layers/fscd_layer.py
index a99e8aa4b..163cf18f7 100644
--- a/easy_rec/python/layers/fscd_layer.py
+++ b/easy_rec/python/layers/fscd_layer.py
@@ -1,16 +1,19 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import json
 import logging
-from collections import OrderedDict
 import math
-import json
+from collections import OrderedDict
+
 import numpy as np
 import tensorflow as tf
 from tensorflow.python.framework.meta_graph import read_meta_graph_file
+
+from easy_rec.python.compat.sort_ops import argsort
+
 from easy_rec.python.compat.feature_column.feature_column import _SharedEmbeddingColumn  # NOQA
 from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn  # NOQA
 from easy_rec.python.compat.feature_column.feature_column_v2 import SharedEmbeddingColumn  # NOQA
-from easy_rec.python.compat.sort_ops import argsort
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -32,14 +35,14 @@ def sigmoid(x):
 
 def get_feature_importance(pipeline_config, feature_group_name=None):
   assert pipeline_config.model_config.HasField(
-    'variational_dropout'), 'variational_dropout must be in model_config'
+      'variational_dropout'), 'variational_dropout must be in model_config'
 
   checkpoint_path = tf.train.latest_checkpoint(pipeline_config.model_dir)
   meta_graph_def = read_meta_graph_file(checkpoint_path + '.meta')
 
   features_map = dict()
   for col_def in meta_graph_def.collection_def[
-    'variational_dropout'].bytes_list.value:
+      'variational_dropout'].bytes_list.value:
     features = json.loads(col_def)
     features_map.update(features)
 
@@ -50,7 +53,12 @@ def get_feature_importance(pipeline_config, feature_group_name=None):
     group_name = feature_group.group_name
     if feature_group_name is not None and feature_group_name != group_name:
       continue
-    assert group_name in features_map, "%s not in feature map" % group_name
+    # assert group_name in features_map, "%s not in feature map" % group_name
+    if group_name not in features_map:
+      # for now, sequence feature groups are not supported
+      logging.warn('%s not in feature map' % group_name)
+      continue
+
     feature_dims = features_map[group_name]
 
     delta_name = 'fscd_delta_%s' % group_name
@@ -71,26 +79,27 @@ def get_feature_importance(pipeline_config, feature_group_name=None):
       if feature in feature_importance:
         raw = feature_importance[feature]
         if probs[i] > raw:
-          logging.info("%s importance change from %d to %d", feature, raw, probs[i])
+          logging.info('%s importance change from %d to %d', feature, raw,
+                       probs[i])
           feature_importance[feature] = probs[i]
       else:
         feature_importance[feature] = probs[i]
   return feature_importance
 
 
-def get_top_and_bottom_features(pipeline_config, top_k):
-  feature_score = get_feature_importance(pipeline_config)
-  top_features = set()
-  bottom_features = set()
-  for feature, score in feature_score.iteritems():
-    if len(top_features) < top_k:
-      top_features.add(feature)
-    else:
-      bottom_features.add(feature)
-
-  print("selected top %d features:" % top_k, ','.join(top_features))
-  print("removed bottom features:", ','.join(bottom_features))
-  return top_features, bottom_features
+# def get_top_and_bottom_features(pipeline_config, top_k):
+#   feature_score = get_feature_importance(pipeline_config)
+#   top_features = set()
+#   bottom_features = set()
+#   for feature, score in feature_score.iteritems():
+#     if len(top_features) < top_k:
+#       top_features.add(feature)
+#     else:
+#       bottom_features.add(feature)
+#
+#   print("selected top %d features:" % top_k, ','.join(top_features))
+#   print("removed bottom features:", ','.join(bottom_features))
+#   return top_features, bottom_features
 
 
 class FSCDLayer(object):
@@ -114,10 +123,10 @@ def __init__(self,
   def compute_dropout_mask(self, n, temperature=0.1):
     delta_name = 'fscd_delta_%s' % self.name
     delta = tf.get_variable(
-      name=delta_name,
-      shape=[n],
-      dtype=tf.float32,
-      initializer=tf.constant_initializer(0.))
+        name=delta_name,
+        shape=[n],
+        dtype=tf.float32,
+        initializer=tf.constant_initializer(0.))
     delta = tf.nn.sigmoid(delta)
 
     EPSILON = np.finfo(float).eps
@@ -146,8 +155,9 @@ def compute_regular_params(self, cols_to_feature):
       theta = 1.0 - sig_c
       alpha = math.log(sig_c) - math.log(theta)
       alphas[fc] = alpha
-      print(str(fc.raw_name), "complexity:", complexity, "cardinality:", cardinal,
-            "dimension:", dim, "c:", c, "theta:", theta, "alpha:", alpha)
+      print(
+          str(fc.raw_name), 'complexity:', complexity, 'cardinality:', cardinal,
+          'dimension:', dim, 'c:', c, 'theta:', theta, 'alpha:', alpha)
     return alphas
 
   def __call__(self, cols_to_feature):
@@ -171,7 +181,8 @@ def __call__(self, cols_to_feature):
       feature_dimension.append((column.raw_name, int(value.shape[-1])))
 
     output_features = tf.concat(output_tensors, 1)
-    tf.add_to_collection('variational_dropout', json.dumps({self.name: feature_dimension}))
+    tf.add_to_collection('variational_dropout',
+                         json.dumps({self.name: feature_dimension}))
 
     batch_size = tf.shape(output_features)[0]
     t_alpha = tf.convert_to_tensor(alphas, dtype=tf.float32)
diff --git a/easy_rec/python/loss/jrc_loss.py b/easy_rec/python/loss/jrc_loss.py
index fc8266b2c..fc77bda86 100644
--- a/easy_rec/python/loss/jrc_loss.py
+++ b/easy_rec/python/loss/jrc_loss.py
@@ -13,6 +13,7 @@ def jrc_loss(labels,
              session_ids,
              alpha=0.5,
              auto_weight=False,
+             sample_weights=1.0,
              name=''):
   """Joint Optimization of Ranking and Calibration with Contextualized Hybrid Model.
 
@@ -24,13 +25,16 @@ def jrc_loss(labels,
     session_ids: a `Tensor` with shape [batch_size]. Session ids of each sample, used to max GAUC metric. e.g. user_id
     alpha: the weight to balance ranking loss and calibration loss
     auto_weight: bool, whether to learn loss weight between ranking loss and calibration loss
+    sample_weights: Coefficients for the loss. This must be scalar or broadcastable to
+      `labels` (i.e. same rank and each dimension is either 1 or the same).
     name: the name of loss
   """
   loss_name = name if name else 'jrc_loss'
   logging.info('[{}] alpha: {}, auto_weight: {}'.format(loss_name, alpha,
                                                         auto_weight))
 
-  ce_loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
+  ce_loss = tf.losses.sparse_softmax_cross_entropy(
+      labels, logits, weights=sample_weights)
 
   labels = tf.expand_dims(labels, 1)  # [B, 1]
   labels = tf.concat([1 - labels, labels], axis=1)  # [B, 2]
@@ -54,6 +58,13 @@ def jrc_loss(labels,
   y_neg, y_pos = y[:, :, 0], y[:, :, 1]
   l_neg, l_pos = logits[:, :, 0], logits[:, :, 1]
 
+  if tf.is_numeric_tensor(sample_weights):
+    logging.info('[%s] use sample weight' % loss_name)
+    weights = tf.expand_dims(tf.cast(sample_weights, tf.float32), 0)
+    pairwise_weights = tf.tile(weights, tf.stack([batch_size, 1]))
+    y_pos *= pairwise_weights
+    y_neg *= pairwise_weights
+
   # Compute list-wise generative loss -log p(x|y, z)
   loss_pos = -tf.reduce_sum(y_pos * tf.nn.log_softmax(l_pos, axis=0), axis=0)
   loss_neg = -tf.reduce_sum(y_neg * tf.nn.log_softmax(l_neg, axis=0), axis=0)
diff --git a/easy_rec/python/tools/feature_selection.py b/easy_rec/python/tools/feature_selection.py
index cbe717351..065993652 100644
--- a/easy_rec/python/tools/feature_selection.py
+++ b/easy_rec/python/tools/feature_selection.py
@@ -10,6 +10,7 @@
 import tensorflow as tf
 from tensorflow.python.framework.meta_graph import read_meta_graph_file
 
+from easy_rec.python.protos.feature_config_pb2 import FeatureConfig
 from easy_rec.python.utils import config_util
 
 if tf.__version__ >= '2.0':
@@ -19,8 +20,9 @@
 matplotlib.use('Agg')  # NOQA
 import matplotlib.pyplot as plt  # NOQA
 
-tf.app.flags.DEFINE_string('model_type', 'variational_dropout',
-                           'feature selection model type')
+tf.app.flags.DEFINE_enum('model_type', 'variational_dropout',
+                         ['variational_dropout', 'fscd'],
+                         'feature selection model type')
 tf.app.flags.DEFINE_string('config_path', '',
                            'feature selection model config path')
 tf.app.flags.DEFINE_string('checkpoint_path', None,
@@ -295,6 +297,7 @@ def _visualize_feature_importance(self, feature_importance, group_name):
 
 
 class FSCD(object):
+
   def __init__(self,
                config_path,
                output_dir,
@@ -318,11 +321,16 @@ def process(self):
         'variational_dropout'), 'variational_dropout must be in model_config'
 
     feature_importance_map = {}
+    white_feature_group = set()
     from easy_rec.python.layers.fscd_layer import get_feature_importance
     for feature_group in config.model_config.feature_groups:
       group_name = feature_group.group_name
       tf.logging.info('Calculating %s feature importance ...' % group_name)
       feature_importance = get_feature_importance(config, group_name)
+      if len(feature_importance) == 0:
+        tf.logging.info('No feature importance in group %s' % group_name)
+        white_feature_group.add(group_name)
+        continue
       feature_importance_map[group_name] = feature_importance
 
       tf.logging.info('Dump %s  feature importance to csv ...' % group_name)
@@ -333,7 +341,7 @@ def process(self):
         self._visualize_feature_importance(feature_importance, group_name)
 
     tf.logging.info('Processing model config ...')
-    self._process_config(feature_importance_map)
+    self._process_config(feature_importance_map, white_feature_group)
 
   def _dump_to_csv(self, feature_importance, group_name):
     """Dump feature importance data to a csv file."""
@@ -355,8 +363,8 @@ def _visualize_feature_importance(self, feature_importance, group_name):
     df.reset_index(inplace=True)
     # Draw plot
     plt.figure(figsize=(90, 200), dpi=100)
-    plt.hlines(y=df.index, xmin=0, xmax=df.mean_drop_p)
-    for x, y, tex in zip(df.mean_drop_p, df.index, df.mean_drop_p):
+    plt.hlines(y=df.index, xmin=0, xmax=df.importance)
+    for x, y, tex in zip(df.importance, df.index, df.importance):
       plt.text(
           x,
           y,
@@ -377,6 +385,69 @@ def _visualize_feature_importance(self, feature_importance, group_name):
                      'feature_importance_pic_%s.png' % group_name), 'wb') as f:
       plt.savefig(f, format='png')
 
+  def _process_config(self, feature_importance_map, white_feature_group):
+    """Process model config and fg config with feature selection."""
+    excluded_features = set()
+    for group_name, feature_importance in feature_importance_map.items():
+      for i, (feature_name, _) in enumerate(feature_importance.items()):
+        if i >= self._topk:
+          excluded_features.add(feature_name)
+
+    config = config_util.get_configs_from_pipeline_file(self._config_path)
+    # keep sequence features and side-infos
+    sequence_features = set()
+    for feature_group in config.model_config.feature_groups:
+      for sequence_feature in feature_group.sequence_features:
+        for seq_att_map in sequence_feature.seq_att_map:
+          for key in seq_att_map.key:
+            sequence_features.add(key)
+          for hist_seq in seq_att_map.hist_seq:
+            sequence_features.add(hist_seq)
+    # compat with din
+    for sequence_feature in config.model_config.seq_att_groups:
+      for seq_att_map in sequence_feature.seq_att_map:
+        for key in seq_att_map.key:
+          sequence_features.add(key)
+        for hist_seq in seq_att_map.hist_seq:
+          sequence_features.add(hist_seq)
+    # sequence feature group
+    for feature_group in config.model_config.feature_groups:
+      group_name = feature_group.group_name
+      if group_name not in white_feature_group:
+        continue
+      for feature_name in feature_group.feature_names:
+        sequence_features.add(feature_name)
+
+    excluded_features = excluded_features - sequence_features
+
+    for feature_config in config_util.get_compatible_feature_configs(config):
+      feature_name = feature_config.input_names[0]
+      if feature_config.HasField('feature_name'):
+          feature_name = feature_config.feature_name
+      if feature_name in excluded_features:
+        feature_config.feature_type = FeatureConfig.FeatureType.ConstFeature
+
+    config.model_config.ClearField('variational_dropout')
+    config_util.save_message(
+        config,
+        os.path.join(self._output_dir, os.path.basename(self._config_path)))
+
+    if self._fg_path is not None and len(self._fg_path) > 0:
+      with tf.gfile.Open(self._fg_path) as f:
+        fg_json = json.load(f, object_pairs_hook=OrderedDict)
+        features = []
+        for feature in fg_json['features']:
+          if 'feature_name' in feature:
+            if feature['feature_name'] not in excluded_features:
+              features.append(feature)
+          else:
+            features.append(feature)
+        fg_json['features'] = features
+
+      fg_file = os.path.join(self._output_dir, os.path.basename(self._fg_path))
+      with tf.gfile.Open(fg_file, 'w') as f:
+        json.dump(fg_json, f, indent=4)
+
 
 if __name__ == '__main__':
   if FLAGS.model_type == 'variational_dropout':
@@ -390,12 +461,12 @@ def _visualize_feature_importance(self, feature_importance, group_name):
     fs.process()
   elif FLAGS.model_type == 'fscd':
     fs = FSCD(
-      FLAGS.config_path,
-      FLAGS.output_dir,
-      FLAGS.topk,
-      checkpoint_path=FLAGS.checkpoint_path,
-      fg_path=FLAGS.fg_path,
-      visualize=FLAGS.visualize)
+        FLAGS.config_path,
+        FLAGS.output_dir,
+        FLAGS.topk,
+        checkpoint_path=FLAGS.checkpoint_path,
+        fg_path=FLAGS.fg_path,
+        visualize=FLAGS.visualize)
     fs.process()
   else:
     raise ValueError('Unknown feature selection model type %s' %
diff --git a/easy_rec/python/tools/view_saved_model.py b/easy_rec/python/tools/view_saved_model.py
new file mode 100644
index 000000000..a3c01909b
--- /dev/null
+++ b/easy_rec/python/tools/view_saved_model.py
@@ -0,0 +1,38 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import argparse
+import logging
+
+from google.protobuf import text_format
+from tensorflow.python.platform.gfile import GFile
+from tensorflow.core.protobuf import saved_model_pb2
+
+logging.basicConfig(
+    format='[%(levelname)s] %(asctime)s %(filename)s:%(lineno)d : %(message)s',
+    level=logging.INFO)
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--input', type=str, default=None, help='saved model path')
+  parser.add_argument('--output', type=str, default=None, help='saved model save path')    
+  args = parser.parse_args()
+
+  assert args.input is not None and args.output is not None
+
+  logging.info('saved_model_path: %s' % args.input)
+
+  saved_model = saved_model_pb2.SavedModel()
+  if args.input.endswith('.pb'):
+    with GFile(args.input, 'rb') as fin:
+      saved_model.ParseFromString(fin.read())
+  else:
+    with GFile(args.input, 'r') as fin:
+      text_format.Merge(fin.read(), saved_model)
+   
+  if args.output.endswith('.pbtxt'):
+    with GFile(args.output, 'w') as fout:
+      fout.write(text_format.MessageToString(saved_model, as_utf8=True))
+  else:
+    with GFile(args.output, 'wb') as fout:
+      fout.write(saved_model.SerializeToString())
diff --git a/setup.cfg b/setup.cfg
index 2303ef802..cd2b0ac0c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ multi_line_output = 7
 force_single_line = true
 known_standard_library = setuptools
 known_first_party = easy_rec
-known_third_party = absl,common_io,docutils,eas_prediction,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
+known_third_party = absl,common_io,docutils,eas_prediction,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,skimage,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
 no_lines_before = LOCALFOLDER
 default_section = THIRDPARTY
 skip = easy_rec/python/protos

From 524ce671c7445f4842e557a42fbb472f66de337b Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 15 May 2023 14:25:45 +0800
Subject: [PATCH 26/54] [feat]: add feature selection tool

---
 .../feature_column/feature_column_v2.py       |  4 +-
 easy_rec/python/layers/fscd_layer.py          | 15 -------
 easy_rec/python/model/multi_task_model.py     | 42 ++++++++++++++-----
 easy_rec/python/model/rank_model.py           | 34 ++++++++++-----
 easy_rec/python/protos/easy_rec_model.proto   |  9 +++-
 5 files changed, 65 insertions(+), 39 deletions(-)

diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py
index 578b0a50a..eb952e7be 100644
--- a/easy_rec/python/compat/feature_column/feature_column_v2.py
+++ b/easy_rec/python/compat/feature_column/feature_column_v2.py
@@ -1338,7 +1338,7 @@ def constant_numeric_column(key,
   Example:
 
   ```python
-  price = numeric_column('price')
+  price = constant_numeric_column('price')
   columns = [price, ...]
   features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
   dense_tensor = input_layer(features, columns)
@@ -1369,7 +1369,7 @@ def constant_numeric_column(key,
       non-quantized, real integer or floating point type.
 
   Returns:
-    A `NumericColumn`.
+    A `ConstantNumericColumn`.
 
   Raises:
     TypeError: if any dimension in shape is not an int
diff --git a/easy_rec/python/layers/fscd_layer.py b/easy_rec/python/layers/fscd_layer.py
index 163cf18f7..2b1071787 100644
--- a/easy_rec/python/layers/fscd_layer.py
+++ b/easy_rec/python/layers/fscd_layer.py
@@ -87,21 +87,6 @@ def get_feature_importance(pipeline_config, feature_group_name=None):
   return feature_importance
 
 
-# def get_top_and_bottom_features(pipeline_config, top_k):
-#   feature_score = get_feature_importance(pipeline_config)
-#   top_features = set()
-#   bottom_features = set()
-#   for feature, score in feature_score.iteritems():
-#     if len(top_features) < top_k:
-#       top_features.add(feature)
-#     else:
-#       bottom_features.add(feature)
-#
-#   print("selected top %d features:" % top_k, ','.join(top_features))
-#   print("removed bottom features:", ','.join(bottom_features))
-#   return top_features, bottom_features
-
-
 class FSCDLayer(object):
   """Rank features by variational dropout.
 
diff --git a/easy_rec/python/model/multi_task_model.py b/easy_rec/python/model/multi_task_model.py
index 43e5663ce..a6bd1b29d 100644
--- a/easy_rec/python/model/multi_task_model.py
+++ b/easy_rec/python/model/multi_task_model.py
@@ -88,6 +88,17 @@ def build_metric_graph(self, eval_config):
 
   def build_loss_graph(self):
     """Build loss graph for multi task model."""
+    strategy = self._base_model_config.loss_weight_strategy
+    loss_weight_arr = [1.0] * len(self._task_towers)
+    if strategy == self._base_model_config.Random:
+      num = 0
+      for task_tower_cfg in self._task_towers:
+        losses = task_tower_cfg.losses
+        num += 1 if len(losses) == 0 else len(losses)
+      weights = tf.random_normal([num])
+      loss_weight_arr = tf.nn.softmax(weights)
+
+    offset = 0
     for task_tower_cfg in self._task_towers:
       tower_name = task_tower_cfg.tower_name
       loss_weight = task_tower_cfg.weight
@@ -111,8 +122,12 @@ def build_loss_graph(self):
             loss_weight=loss_weight,
             num_class=task_tower_cfg.num_class,
             suffix='_%s' % tower_name)
+        if strategy == self._base_model_config.Random:
+          for loss_name in loss_dict.keys():
+            loss_dict[loss_name] = loss_dict[loss_name] * loss_weight_arr[offset]
+        offset += 1
       else:
-        for loss in losses:
+        for i, loss in enumerate(losses):
           loss_param = loss.WhichOneof('loss_param')
           if loss_param is not None:
             loss_param = getattr(loss, loss_param)
@@ -125,19 +140,26 @@ def build_loss_graph(self):
               loss_name=loss.loss_name,
               loss_param=loss_param)
           for loss_name, loss_value in loss_ops.items():
-            if loss.learn_loss_weight:
-              uncertainty = tf.Variable(
+            if strategy == self._base_model_config.Fixed:
+              loss_dict[loss_name] = loss_value * loss.weight
+            elif strategy == self._base_model_config.Uncertainty:
+              if loss.learn_loss_weight:
+                uncertainty = tf.Variable(
                   0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
-              tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
-              if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
-                loss_dict[loss_name] = 0.5 * tf.exp(
+                tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
+                if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
+                  loss_dict[loss_name] = 0.5 * tf.exp(
                     -uncertainty) * loss_value + 0.5 * uncertainty
-              else:
-                loss_dict[loss_name] = tf.exp(
+                else:
+                  loss_dict[loss_name] = tf.exp(
                     -uncertainty) * loss_value + 0.5 * uncertainty
+              else:
+                loss_dict[loss_name] = loss_value * loss.weight
+            elif strategy == self._base_model_config.Random:
+              loss_dict[loss_name] = loss_value * loss_weight_arr[i + offset]
             else:
-              loss_dict[loss_name] = loss_value * loss.weight
-
+              raise ValueError("Unsupported loss weight strategy: " + strategy.Name)
+        offset += len(losses)
       self._loss_dict.update(loss_dict)
 
     kd_loss_dict = loss_builder.build_kd_loss(self.kd, self._prediction_dict,
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index 25eff23ea..e4a38fa2d 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -193,7 +193,12 @@ def build_loss_graph(self):
           loss_weight=self._sample_weight,
           num_class=self._num_class)
     else:
-      for loss in self._losses:
+      strategy = self._base_model_config.loss_weight_strategy
+      loss_weight = [1.0]
+      if strategy == self._base_model_config.Random and len(self._losses) > 1:
+        weights = tf.random_normal([len(self._losses)])
+        loss_weight = tf.nn.softmax(weights)
+      for i, loss in enumerate(self._losses):
         loss_param = loss.WhichOneof('loss_param')
         if loss_param is not None:
           loss_param = getattr(loss, loss_param)
@@ -205,18 +210,25 @@ def build_loss_graph(self):
             loss_name=loss.loss_name,
             loss_param=loss_param)
         for loss_name, loss_value in loss_ops.items():
-          if loss.learn_loss_weight:
-            uncertainty = tf.Variable(
-                0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
-            tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
-            if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
-              loss_dict[loss_name] = 0.5 * tf.exp(
-                  -uncertainty) * loss_value + 0.5 * uncertainty
+          if strategy == self._base_model_config.Fixed:
+            loss_dict[loss_name] = loss_value * loss.weight
+          elif strategy == self._base_model_config.Uncertainty:
+            if loss.learn_loss_weight:
+              uncertainty = tf.Variable(
+                  0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
+              tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
+              if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
+                loss_dict[loss_name] = 0.5 * tf.exp(
+                    -uncertainty) * loss_value + 0.5 * uncertainty
+              else:
+                loss_dict[loss_name] = tf.exp(
+                    -uncertainty) * loss_value + 0.5 * uncertainty
             else:
-              loss_dict[loss_name] = tf.exp(
-                  -uncertainty) * loss_value + 0.5 * uncertainty
+              loss_dict[loss_name] = loss_value * loss.weight
+          elif strategy == self._base_model_config.Random:
+            loss_dict[loss_name] = loss_value * loss_weight[i]
           else:
-            loss_dict[loss_name] = loss_value * loss.weight
+            raise ValueError("Unsupported loss weight strategy: " + strategy.Name)
 
     self._loss_dict.update(loss_dict)
 
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 42f454d95..770611880 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -103,6 +103,13 @@ message EasyRecModel {
 
     repeated Loss losses = 15;
 
+    enum LossWeightStrategy {
+        Fixed = 0;
+        Uncertainty = 1;
+        Random = 2;
+    }
+    required LossWeightStrategy loss_weight_strategy = 16 [default = Fixed];
+
     // dnn layers after sequence feature
-    optional DNN sequence_dnn = 16;
+    optional DNN sequence_dnn = 17;
 }

From 3a8d7329a378f41b4bf48d3b8e007f799006cca6 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Thu, 25 May 2023 11:15:06 +0800
Subject: [PATCH 27/54] [feat]: add fibinet & masknet

---
 .../compat/feature_column/feature_column.py   |  24 ++-
 .../feature_column/feature_column_v2.py       |   6 +-
 easy_rec/python/inference/predictor.py        |   2 +-
 easy_rec/python/input/input.py                |   5 +-
 easy_rec/python/layers/common_layers.py       | 171 ++++++++++++++++++
 easy_rec/python/layers/fibinet.py             |  53 ++++++
 easy_rec/python/layers/fscd_layer.py          |  57 ++++--
 easy_rec/python/layers/input_layer.py         |  10 +-
 easy_rec/python/layers/mask_net.py            |  73 ++++++++
 easy_rec/python/model/dbmtl.py                |  13 ++
 easy_rec/python/model/easy_rec_model.py       |   3 +-
 easy_rec/python/model/multi_task_model.py     |  19 +-
 easy_rec/python/model/rank_model.py           |   3 +-
 easy_rec/python/protos/dbmtl.proto            |   6 +
 easy_rec/python/protos/easy_rec_model.proto   |   2 +
 easy_rec/python/protos/fibinet.proto          |  15 ++
 easy_rec/python/protos/masknet.proto          |  17 ++
 .../python/protos/variational_dropout.proto   |   6 +-
 easy_rec/python/tools/feature_selection.py    |   2 +-
 easy_rec/python/tools/view_saved_model.py     |   7 +-
 20 files changed, 452 insertions(+), 42 deletions(-)
 create mode 100644 easy_rec/python/layers/fibinet.py
 create mode 100644 easy_rec/python/layers/mask_net.py
 create mode 100644 easy_rec/python/protos/fibinet.proto
 create mode 100644 easy_rec/python/protos/masknet.proto

diff --git a/easy_rec/python/compat/feature_column/feature_column.py b/easy_rec/python/compat/feature_column/feature_column.py
index 56d3357c7..09d791386 100644
--- a/easy_rec/python/compat/feature_column/feature_column.py
+++ b/easy_rec/python/compat/feature_column/feature_column.py
@@ -167,6 +167,7 @@
 
 from easy_rec.python.compat import embedding_ops as ev_embedding_ops
 from easy_rec.python.compat.feature_column import utils as fc_utils
+from easy_rec.python.layers.common_layers import layer_norm
 
 
 def _internal_input_layer(features,
@@ -177,7 +178,8 @@ def _internal_input_layer(features,
                           scope=None,
                           cols_to_output_tensors=None,
                           from_template=False,
-                          feature_name_to_output_tensors=None):
+                          feature_name_to_output_tensors=None,
+                          do_normalize=False):
   """See input_layer, `scope` is a name or variable scope to use."""
   feature_columns = _normalize_feature_columns(feature_columns)
   for column in feature_columns:
@@ -208,6 +210,18 @@ def _get_logits():  # pylint: disable=missing-docstring
         batch_size = array_ops.shape(tensor)[0]
         output_tensor = array_ops.reshape(
             tensor, shape=(batch_size, num_elements))
+        if do_normalize:
+          from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn, NumericColumn, \
+            WeightedCategoricalColumn
+          from tensorflow.python.layers.normalization import batch_normalization
+          if isinstance(column, EmbeddingColumn) or isinstance(column, _SharedEmbeddingColumn):
+            fc = column.categorical_column
+            if isinstance(fc, WeightedCategoricalColumn) and fc.weight_feature_key.endswith('_raw_proj_val'):
+              output_tensor = layer_norm(output_tensor, name='ln_' + column.name)
+            else:
+              output_tensor = batch_normalization(output_tensor, name='bn_'+column.name)
+          elif isinstance(column, NumericColumn) and int(column.shape[-1]) > 1:
+            output_tensor = layer_norm(output_tensor, name='ln_' + column.name)
         output_tensors.append(output_tensor)
         if cols_to_vars is not None:
           # Retrieve any variables created (some _DenseColumn's don't create
@@ -239,7 +253,8 @@ def input_layer(features,
                 trainable=True,
                 cols_to_vars=None,
                 cols_to_output_tensors=None,
-                feature_name_to_output_tensors=None):
+                feature_name_to_output_tensors=None,
+                do_normalize=False):
   """Returns a dense `Tensor` as input layer based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
@@ -287,6 +302,8 @@ def input_layer(features,
     cols_to_output_tensors: If not `None`, must be a dictionary that will be
       filled with a mapping from '_FeatureColumn' to the associated
       output `Tensor`s.
+    do_normalize: Whether to do layer normalization for numerical features and
+      batch normalization operation for categorical features.
 
   Returns:
     A `Tensor` which represents input layer of a model. Its shape
@@ -303,7 +320,8 @@ def input_layer(features,
       trainable=trainable,
       cols_to_vars=cols_to_vars,
       cols_to_output_tensors=cols_to_output_tensors,
-      feature_name_to_output_tensors=feature_name_to_output_tensors)
+      feature_name_to_output_tensors=feature_name_to_output_tensors,
+      do_normalize=do_normalize)
 
 
 # TODO(akshayka): InputLayer should be a subclass of Layer, and it
diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py
index eb952e7be..c264c30c2 100644
--- a/easy_rec/python/compat/feature_column/feature_column_v2.py
+++ b/easy_rec/python/compat/feature_column/feature_column_v2.py
@@ -2687,9 +2687,9 @@ def _normalize_feature_columns(feature_columns):
 class ConstantNumericColumn(
     DenseColumn,
     fc_old._DenseColumn,  # pylint: disable=protected-access
-    collections.namedtuple('ConstantNumericColumn',
-                           ('feature_name', 'key', 'shape', 'default_value',
-                            'dtype'))):
+    collections.namedtuple(
+        'ConstantNumericColumn',
+        ('feature_name', 'key', 'shape', 'default_value', 'dtype'))):
   """see `numeric_column`."""
 
   @property
diff --git a/easy_rec/python/inference/predictor.py b/easy_rec/python/inference/predictor.py
index e39592c18..e17871892 100644
--- a/easy_rec/python/inference/predictor.py
+++ b/easy_rec/python/inference/predictor.py
@@ -223,7 +223,7 @@ def _build_model(self):
               input_name = tensor.name
               input_name, _ = input_name.split(':')
               input_op = self._graph.get_operation_by_name(input_name)
-              if input_op.type == "PlaceholderWithDefault":
+              if input_op.type == 'PlaceholderWithDefault':
                 continue
               try:
                 input_id = input_name.split('_')[-1]
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index 2775ad1ac..d2325e680 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -515,8 +515,9 @@ def expand_input():
       multiples[0] = batch_size
       return tf.tile(input_tensor, multiples)
 
-    input_tensor = tf.cond(tf.equal(tf.shape(input_tensor)[0], batch_size),
-                           lambda: input_tensor, expand_input)
+    input_tensor = tf.cond(
+        tf.equal(tf.shape(input_tensor)[0], batch_size), lambda: input_tensor,
+        expand_input)
     feature_name = fc.feature_name if fc.HasField('feature_name') else input_0
     parsed_dict[feature_name] = input_tensor
 
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index 165fce5e1..e3bb65f64 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -1,8 +1,12 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import itertools
+import logging
 
 import tensorflow as tf
 
+from easy_rec.python.compat.layers import layer_norm as tf_layer_norm
+
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
 
@@ -61,3 +65,170 @@ def text_cnn(x,
   pool_flat = tf.concat(
       pooled_outputs, 1)  # shape: (batch_size, num_filters * len(filter_sizes))
   return pool_flat
+
+
+def layer_norm(input_tensor, name=None, reuse=None):
+  """Run layer normalization on the last dimension of the tensor."""
+  return tf_layer_norm(
+      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, reuse=reuse, scope=name)
+
+
+class SENet(object):
+  """
+    SENet+ Layer，支持不同field的embedding dimension不等
+    arxiv: 2209.05016
+    """
+
+  def __init__(self, reduction_ratio, num_groups, name='SENet'):
+    self.reduction_ratio = reduction_ratio
+    self.num_groups = num_groups
+    self.name = name
+
+  def __call__(self, embedding_list):
+    """
+
+      :param embedding_list: [embedding_1,...,embedding_i,...,embedding_f]，f为field的数目，embedding_i is [bs, dim]
+      :return:
+      """
+    print("SENET layer with %d inputs" % len(embedding_list))
+    for emb in embedding_list:
+      assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
+
+    field_size = len(embedding_list)
+    feature_size_list = [emb.shape.as_list()[-1] for emb in embedding_list]
+
+    # Squeeze
+    g = self.num_groups
+    # embedding dimension 必须能被 g 整除
+    group_embs = [
+        tf.reshape(emb, [-1, g, tf.shape(emb)[-1] // g])
+        for emb in embedding_list
+    ]
+
+    squeezed = []
+    for emb in group_embs:
+      squeezed.append(tf.reduce_max(emb, axis=-1))
+      squeezed.append(tf.reduce_mean(emb, axis=-1))
+    z = tf.concat(squeezed, axis=1)  # [bs, field_size * num_groups * 2]
+
+    # Excitation
+    reduction_size = max(1, field_size * g * 2 // self.reduction_ratio)
+
+    initializer = tf.glorot_normal_initializer()
+    a1 = tf.layers.dense(
+        z,
+        reduction_size,
+        kernel_initializer=initializer,
+        activation=tf.nn.relu,
+        name='%s/W1' % self.name)
+    a2 = tf.layers.dense(
+        a1,
+        sum(feature_size_list),
+        kernel_initializer=initializer,
+        name='%s/W2' % self.name)
+
+    # Re-weight & Fuse
+    a = tf.split(a2, feature_size_list, axis=1)
+    senet_like_embeddings = [
+        layer_norm(emb * w + emb) for emb, w in zip(embedding_list, a)
+    ]
+    return tf.concat(senet_like_embeddings, axis=-1)
+
+
+def _full_interaction(v_i, v_j):
+  # [bs, 1, dim] x [bs, dim, 1] = [bs, 1]
+  interaction = tf.matmul(
+      tf.expand_dims(v_i, axis=1), tf.expand_dims(v_j, axis=-1))
+  return tf.squeeze(interaction, axis=1)
+
+
+class BiLinear(object):
+
+  def __init__(self,
+               output_size,
+               bilinear_type,
+               bilinear_plus=True,
+               name='bilinear'):
+    """双线性特征交互层，支持不同field embeddings的size不等.
+
+    arxiv: 2209.05016
+    :param output_size: 输出的size
+    :param bilinear_type: ['all', 'each', 'interaction']，支持其中一种
+    :param bilinear_plus: 是否使用bi-linear+
+    """
+    self.name = name
+    self.bilinear_type = bilinear_type.lower()
+    self.output_size = output_size
+
+    if bilinear_type not in ['all', 'each', 'interaction']:
+      raise NotImplementedError(
+          "bilinear_type only support: ['all', 'each', 'interaction']")
+
+    if bilinear_plus:
+      self.func = _full_interaction
+    else:
+      self.func = tf.multiply
+
+  def __call__(self, embeddings):
+    print("Bilinear Layer with %d inputs" % len(embeddings))
+    if len(embeddings) > 200:
+      logging.warn("There are too many inputs for bilinear layer: %d" % len(embeddings))
+    equal_dim = True
+    _dim = embeddings[0].shape[-1]
+    for emb in embeddings:
+      assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
+      if emb.shape[-1] != _dim:
+        equal_dim = False
+    if not equal_dim and self.bilinear_type != 'interaction':
+      raise ValueError('all embedding dimensions must be same when use bilinear type: interaction')
+    dim = int(_dim)
+
+    field_size = len(embeddings)
+    initializer = tf.glorot_normal_initializer()
+
+    # bi-linear+: p的维度为[bs, f*(f-1)/2]
+    # bi-linear:
+    # 当equal_dim=True时，p的维度为[bs, f*(f-1)/2*k]，k为embeddings的size
+    # 当equal_dim=False时，p的维度为[bs, (k_2+k_3+...+k_f)+...+(k_i+k_{i+1}+...+k_f)+...+k_f]，
+    # 其中 k_i为第i个field的embedding的size
+    if self.bilinear_type == 'all':
+      v_dot = [
+          tf.layers.dense(
+              v_i,
+              dim,
+              kernel_initializer=initializer,
+              name='%s/all' % self.name,
+              reuse=tf.AUTO_REUSE) for v_i in embeddings[:-1]
+      ]
+      p = [
+          self.func(v_dot[i], embeddings[j])
+          for i, j in itertools.combinations(range(field_size), 2)
+      ]
+    elif self.bilinear_type == 'each':
+      v_dot = [
+          tf.layers.dense(
+              v_i,
+              dim,
+              kernel_initializer=initializer,
+              name='%s/each_%d' % (self.name, i),
+              reuse=tf.AUTO_REUSE) for i, v_i in enumerate(embeddings[:-1])
+      ]
+      p = [
+          self.func(v_dot[i], embeddings[j])
+          for i, j in itertools.combinations(range(field_size), 2)
+      ]
+    else:  # interaction
+      p = [
+          self.func(
+              tf.layers.dense(
+                  embeddings[i],
+                  embeddings[j].shape.as_list()[-1],
+                  kernel_initializer=initializer,
+                  name='%s/interaction_%d_%d' % (self.name, i, j),
+                  reuse=tf.AUTO_REUSE), embeddings[j])
+          for i, j in itertools.combinations(range(field_size), 2)
+      ]
+
+    output = tf.layers.dense(
+        tf.concat(p, axis=-1), self.output_size, kernel_initializer=initializer)
+    return output
diff --git a/easy_rec/python/layers/fibinet.py b/easy_rec/python/layers/fibinet.py
new file mode 100644
index 000000000..9a419e004
--- /dev/null
+++ b/easy_rec/python/layers/fibinet.py
@@ -0,0 +1,53 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+from easy_rec.python.layers.common_layers import SENet
+from easy_rec.python.layers.common_layers import BiLinear
+from easy_rec.python.layers import dnn
+
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class FiBiNetLayer(object):
+  """FiBiNet++:Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction.
+
+  This is almost an exact implementation of the original FiBiNet++ model.
+  See the original paper:
+  https://arxiv.org/pdf/2209.05016.pdf
+  """
+
+  def __init__(self, fibinet_config, features, input_layer):
+    self._config = fibinet_config
+    self._input_layer = input_layer
+    self._features = features
+
+  def __call__(self, group_name, is_training, l2_reg=0, *args, **kwargs):
+    feature_list = []
+    _, group_features = self._input_layer(self._features, group_name)
+    senet = SENet(reduction_ratio=self._config.senet_reduction_ratio,
+                       num_groups=self._config.num_senet_squeeze_group,
+                       name='%s_senet' % group_name)
+    senet_output = senet(group_features)
+    feature_list.append(senet_output)
+
+    if self._config.bilinear_type != 'none':
+      bilinear = BiLinear(output_size=self._config.bilinear_output_units,
+                          bilinear_type=self._config.bilinear_type,
+                          bilinear_plus=self._config.use_bilinear_plus,
+                          name='%s_bilinear' % group_name)
+      bilinear_output = bilinear(group_features)
+      feature_list.append(bilinear_output)
+
+    if len(feature_list) > 1:
+      feature = tf.concat(feature_list, axis=-1)
+    else:
+      feature = feature_list[0]
+
+    final_dnn = dnn.DNN(
+      self._config.mlp,
+      l2_reg,
+      name='%s_fibinet_mlp' % group_name,
+      is_training=is_training)
+    return final_dnn(feature)
diff --git a/easy_rec/python/layers/fscd_layer.py b/easy_rec/python/layers/fscd_layer.py
index 2b1071787..ec115f547 100644
--- a/easy_rec/python/layers/fscd_layer.py
+++ b/easy_rec/python/layers/fscd_layer.py
@@ -35,14 +35,14 @@ def sigmoid(x):
 
 def get_feature_importance(pipeline_config, feature_group_name=None):
   assert pipeline_config.model_config.HasField(
-      'variational_dropout'), 'variational_dropout must be in model_config'
+    'variational_dropout'), 'variational_dropout must be in model_config'
 
   checkpoint_path = tf.train.latest_checkpoint(pipeline_config.model_dir)
   meta_graph_def = read_meta_graph_file(checkpoint_path + '.meta')
 
   features_map = dict()
   for col_def in meta_graph_def.collection_def[
-      'variational_dropout'].bytes_list.value:
+    'variational_dropout'].bytes_list.value:
     features = json.loads(col_def)
     features_map.update(features)
 
@@ -105,24 +105,30 @@ def __init__(self,
     self.name = name
     self.feature_complexity = get_feature_complexity(feature_configs)
 
-  def compute_dropout_mask(self, n, temperature=0.1):
+  def compute_dropout_mask(self, n):
     delta_name = 'fscd_delta_%s' % self.name
     delta = tf.get_variable(
-        name=delta_name,
-        shape=[n],
-        dtype=tf.float32,
-        initializer=tf.constant_initializer(0.))
+      name=delta_name,
+      shape=[n],
+      dtype=tf.float32,
+      initializer=tf.constant_initializer(0.))
     delta = tf.nn.sigmoid(delta)
+    epsilon = np.finfo(float).eps
+    max_keep_ratio = self._config.max_keep_ratio
+    min_keep_ratio = self._config.min_keep_ratio
+    if max_keep_ratio >= 1.0:
+      max_keep_ratio = 1.0 - epsilon
+    if min_keep_ratio <= 0.0:
+      min_keep_ratio = epsilon
+    delta = tf.clip_by_value(delta, min_keep_ratio, max_keep_ratio)
 
-    EPSILON = np.finfo(float).eps
     unif_noise = tf.random_uniform([n],
                                    dtype=tf.float32,
                                    seed=None,
                                    name='uniform_noise')
-    approx = (
-        tf.log(delta + EPSILON) - tf.log(1. - delta + EPSILON) +
-        tf.log(unif_noise + EPSILON) - tf.log(1. - unif_noise + EPSILON))
-    return tf.sigmoid(approx / temperature)
+    approx = (tf.log(delta) - tf.log(1. - delta) +
+              tf.log(unif_noise) - tf.log(1. - unif_noise))
+    return tf.sigmoid(approx / self._config.temperature), delta
 
   def compute_regular_params(self, cols_to_feature):
     alphas = {}
@@ -141,8 +147,8 @@ def compute_regular_params(self, cols_to_feature):
       alpha = math.log(sig_c) - math.log(theta)
       alphas[fc] = alpha
       print(
-          str(fc.raw_name), 'complexity:', complexity, 'cardinality:', cardinal,
-          'dimension:', dim, 'c:', c, 'theta:', theta, 'alpha:', alpha)
+        str(fc.raw_name), 'complexity:', complexity, 'cardinality:', cardinal,
+        'dimension:', dim, 'c:', c, 'theta:', theta, 'alpha:', alpha)
     return alphas
 
   def __call__(self, cols_to_feature):
@@ -152,14 +158,21 @@ def __call__(self, cols_to_feature):
     feature_dimension = []
     output_tensors = []
     alphas = []
-    z = self.compute_dropout_mask(len(cols_to_feature))  # keep ratio
+    z, delta = self.compute_dropout_mask(len(cols_to_feature))  # keep ratio
+    tf.summary.histogram('fscd_keep_ratio', delta)
+    tf.summary.histogram('fscd_keep_mask', z)
     regular = self.compute_regular_params(cols_to_feature)
+
     feature_columns = cols_to_feature.keys()
     for column in sorted(feature_columns, key=lambda x: x.name):
       value = cols_to_feature[column]
       alpha = regular[column]
       i = len(output_tensors)
-      out = value * z[i] if self.is_training else value
+      if self.is_training:
+        scaled_value = tf.div(value, delta[i])
+        out = tf.multiply(scaled_value, z[i], name='fscd_dropout')
+      else:
+        out = value
       cols_to_feature[column] = out
       output_tensors.append(out)
       alphas.append(alpha)
@@ -175,3 +188,15 @@ def __call__(self, cols_to_feature):
 
     tf.add_to_collection('variational_dropout_loss', loss)
     return output_features
+
+
+# def dropout(p):
+#    u = np.random.uniform()
+#    x = math.log(p) - math.log(1-p) + math.log(u) - math.log(1-u)
+#    z = sigmoid(x/0.1)
+#    return z
+#
+#
+# if __name__ == '__main__':
+#    for i in range(100):
+#      print(dropout(0.5))
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index 7e28458d5..ced65c0cf 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -38,7 +38,8 @@ def __init__(self,
                ev_params=None,
                embedding_regularizer=None,
                kernel_regularizer=None,
-               is_training=False):
+               is_training=False,
+               do_feature_normalize=False):
     self._feature_configs = feature_configs
     self._feature_groups = {
         x.group_name: FeatureGroup(x) for x in feature_groups_config
@@ -66,6 +67,7 @@ def __init__(self,
     self._kernel_regularizer = kernel_regularizer
     self._is_training = is_training
     self._variational_dropout_config = variational_dropout_config
+    self._do_feature_normalize = do_feature_normalize
 
   def has_group(self, group_name):
     return group_name in self._feature_groups
@@ -135,7 +137,8 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
             features,
             group_columns,
             cols_to_output_tensors=cols_to_output_tensors,
-            feature_name_to_output_tensors=feature_name_to_output_tensors)
+            feature_name_to_output_tensors=feature_name_to_output_tensors,
+            do_normalize=self._do_feature_normalize)
         group_features = [cols_to_output_tensors[x] for x in group_columns]
 
         for col, val in cols_to_output_tensors.items():
@@ -185,7 +188,8 @@ def single_call_input_layer(self,
         features,
         group_columns,
         cols_to_output_tensors=cols_to_output_tensors,
-        feature_name_to_output_tensors=feature_name_to_output_tensors)
+        feature_name_to_output_tensors=feature_name_to_output_tensors,
+        do_normalize=self._do_feature_normalize)
 
     embedding_reg_lst = []
     builder = feature_column._LazyBuilder(features)
diff --git a/easy_rec/python/layers/mask_net.py b/easy_rec/python/layers/mask_net.py
new file mode 100644
index 000000000..fe4816fe8
--- /dev/null
+++ b/easy_rec/python/layers/mask_net.py
@@ -0,0 +1,73 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+
+from easy_rec.python.layers import dnn
+from easy_rec.python.layers.common_layers import layer_norm
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class MaskBlock(object):
+  def __init__(self, mask_block_config):
+    self.mask_block_config = mask_block_config
+
+  def __call__(self, net, mask_input):
+    mask_input_dim = int(mask_input.shape[-1])
+    if self.mask_block_config.HasField('reduction_factor'):
+      aggregation_size = int(mask_input_dim * self.mask_block_config.reduction_factor)
+    elif self.mask_block_config.HasField('aggregation_size') is not None:
+      aggregation_size = self.mask_block_config.aggregation_size
+    else:
+      raise ValueError("Need one of reduction factor or aggregation size for MaskBlock.")
+
+    if self.mask_block_config.input_layer_norm:
+      input_name = net.name.replace(':', '_')
+      net = layer_norm(net, reuse=tf.AUTO_REUSE, name='ln_' + input_name)
+
+    # initializer = tf.initializers.variance_scaling()
+    initializer = tf.glorot_uniform_initializer()
+    mask = tf.layers.dense(mask_input, aggregation_size,
+                           activation=tf.nn.relu,
+                           kernel_initializer=initializer)
+    mask = tf.layers.dense(mask, net.shape[-1])
+    masked_net = net * mask
+
+    output_size = self.mask_block_config.output_size
+    hidden_layer_output = tf.layers.dense(masked_net, output_size)
+    return layer_norm(hidden_layer_output)
+
+
+class MaskNet(object):
+  def __init__(self, mask_net_config, name='mask_net'):
+    self.mask_net_config = mask_net_config
+    self.name = name
+
+  def __call__(self, inputs, is_training, l2_reg=None):
+    conf = self.mask_net_config
+    if conf.use_parallel:
+      mask_outputs = []
+      for block_conf in self.mask_net_config.mask_blocks:
+        mask_layer = MaskBlock(block_conf)
+        mask_outputs.append(mask_layer(mask_input=inputs, net=inputs))
+      all_mask_outputs = tf.concat(mask_outputs, axis=1)
+
+      if conf.HasField('mlp'):
+        mlp = dnn.DNN(conf.mlp, l2_reg, name='%s/mlp' % self.name, is_training=is_training)
+        output = mlp(all_mask_outputs)
+      else:
+        output = all_mask_outputs
+      return output
+    else:
+      net = inputs
+      for block_conf in self.mask_net_config.mask_blocks:
+        mask_layer = MaskBlock(block_conf)
+        net = mask_layer(net=net, mask_input=inputs)
+
+      if conf.HasField('mlp'):
+        mlp = dnn.DNN(conf.mlp, l2_reg, name='%s/mlp' % self.name, is_training=is_training)
+        output = mlp(net)
+      else:
+        output = net
+      return output
diff --git a/easy_rec/python/model/dbmtl.py b/easy_rec/python/model/dbmtl.py
index 3639bf029..e829ba57f 100644
--- a/easy_rec/python/model/dbmtl.py
+++ b/easy_rec/python/model/dbmtl.py
@@ -6,6 +6,8 @@
 from easy_rec.python.layers import dnn
 from easy_rec.python.layers import mmoe
 from easy_rec.python.layers import uniter
+from easy_rec.python.layers import fibinet
+from easy_rec.python.layers import mask_net
 from easy_rec.python.model.multi_task_model import MultiTaskModel
 from easy_rec.python.protos.dbmtl_pb2 import DBMTL as DBMTLConfig
 
@@ -37,6 +39,13 @@ def __init__(self,
                                          features,
                                          self._model_config.bottom_uniter,
                                          self._input_layer)
+    elif self._model_config.HasField('bottom_fibinet'):
+      self._fibinet_layer = fibinet.FiBiNetLayer(self._model_config.bottom_fibinet,
+                                                 features,
+                                                 self._input_layer)
+    elif self._model_config.HasField('bottom_mask_net'):
+      self._mask_net_layer = mask_net.MaskNet(self._model_config.bottom_mask_net)
+      self._features, _ = self._input_layer(self._feature_dict, 'all')
     else:
       self._features, _ = self._input_layer(self._feature_dict, 'all')
     self._init_towers(self._model_config.task_towers)
@@ -60,6 +69,10 @@ def build_predict_graph(self):
       bottom_fea = self._cmbf_layer(self._is_training, l2_reg=self._l2_reg)
     elif self._model_config.HasField('bottom_uniter'):
       bottom_fea = self._uniter_layer(self._is_training, l2_reg=self._l2_reg)
+    elif self._model_config.HasField('bottom_fibinet'):
+      bottom_fea = self._fibinet_layer('all', self._is_training, l2_reg=self._l2_reg)
+    elif self._model_config.HasField('bottom_mask_net'):
+      bottom_fea = self._mask_net_layer(self._features, self._is_training, l2_reg=self._l2_reg)
     elif self._model_config.HasField('bottom_dnn'):
       bottom_dnn = dnn.DNN(
           self._model_config.bottom_dnn,
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index 6483877b7..4a7ad6330 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -104,7 +104,8 @@ def build_input_layer(self, model_config, feature_configs):
         kernel_regularizer=self._l2_reg,
         variational_dropout_config=model_config.variational_dropout
         if model_config.HasField('variational_dropout') else None,
-        is_training=self._is_training)
+        is_training=self._is_training,
+        do_feature_normalize=model_config.do_feature_normalize)
 
   def get_sequence_encoding(self, group_name=None, is_training=True):
     if group_name is not None:
diff --git a/easy_rec/python/model/multi_task_model.py b/easy_rec/python/model/multi_task_model.py
index a6bd1b29d..06dc53f8a 100644
--- a/easy_rec/python/model/multi_task_model.py
+++ b/easy_rec/python/model/multi_task_model.py
@@ -124,7 +124,8 @@ def build_loss_graph(self):
             suffix='_%s' % tower_name)
         if strategy == self._base_model_config.Random:
           for loss_name in loss_dict.keys():
-            loss_dict[loss_name] = loss_dict[loss_name] * loss_weight_arr[offset]
+            loss_dict[
+                loss_name] = loss_dict[loss_name] * loss_weight_arr[offset]
         offset += 1
       else:
         for i, loss in enumerate(losses):
@@ -145,20 +146,24 @@ def build_loss_graph(self):
             elif strategy == self._base_model_config.Uncertainty:
               if loss.learn_loss_weight:
                 uncertainty = tf.Variable(
-                  0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
-                tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
-                if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
+                    0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
+                tf.summary.scalar('loss/%s_uncertainty' % loss_name,
+                                  uncertainty)
+                if loss.loss_type in {
+                    LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS
+                }:
                   loss_dict[loss_name] = 0.5 * tf.exp(
-                    -uncertainty) * loss_value + 0.5 * uncertainty
+                      -uncertainty) * loss_value + 0.5 * uncertainty
                 else:
                   loss_dict[loss_name] = tf.exp(
-                    -uncertainty) * loss_value + 0.5 * uncertainty
+                      -uncertainty) * loss_value + 0.5 * uncertainty
               else:
                 loss_dict[loss_name] = loss_value * loss.weight
             elif strategy == self._base_model_config.Random:
               loss_dict[loss_name] = loss_value * loss_weight_arr[i + offset]
             else:
-              raise ValueError("Unsupported loss weight strategy: " + strategy.Name)
+              raise ValueError('Unsupported loss weight strategy: ' +
+                               strategy.Name)
         offset += len(losses)
       self._loss_dict.update(loss_dict)
 
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index e4a38fa2d..4f4368b9f 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -228,7 +228,8 @@ def build_loss_graph(self):
           elif strategy == self._base_model_config.Random:
             loss_dict[loss_name] = loss_value * loss_weight[i]
           else:
-            raise ValueError("Unsupported loss weight strategy: " + strategy.Name)
+            raise ValueError('Unsupported loss weight strategy: ' +
+                             strategy.Name)
 
     self._loss_dict.update(loss_dict)
 
diff --git a/easy_rec/python/protos/dbmtl.proto b/easy_rec/python/protos/dbmtl.proto
index 2b1f981aa..5c7152ee1 100644
--- a/easy_rec/python/protos/dbmtl.proto
+++ b/easy_rec/python/protos/dbmtl.proto
@@ -4,12 +4,18 @@ package protos;
 import "easy_rec/python/protos/dnn.proto";
 import "easy_rec/python/protos/tower.proto";
 import "easy_rec/python/protos/layer.proto";
+import "easy_rec/python/protos/fibinet.proto";
+import "easy_rec/python/protos/masknet.proto";
 
 message DBMTL {
     // shared bottom cmbf layer
     optional CMBFTower bottom_cmbf = 101;
     // shared bottom uniter layer
     optional UniterTower bottom_uniter = 102;
+    // shared bottom fibinet layer
+    optional FiBiNetTower bottom_fibinet = 103;
+    // shared bottom mask net layer
+    optional MaskNet bottom_mask_net = 104;
     // shared bottom dnn layer
     optional DNN bottom_dnn = 1;
     // mmoe expert dnn layer definition
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 770611880..f28180e10 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -112,4 +112,6 @@ message EasyRecModel {
 
     // dnn layers after sequence feature
     optional DNN sequence_dnn = 17;
+
+    optional bool do_feature_normalize = 18;
 }
diff --git a/easy_rec/python/protos/fibinet.proto b/easy_rec/python/protos/fibinet.proto
new file mode 100644
index 000000000..b13fd7cba
--- /dev/null
+++ b/easy_rec/python/protos/fibinet.proto
@@ -0,0 +1,15 @@
+syntax = "proto2";
+package protos;
+
+import "easy_rec/python/protos/dnn.proto";
+
+message FiBiNetTower {
+    required string bilinear_type = 1 [default = 'interaction'];
+    required bool use_bilinear_plus = 2 [default = true];
+    required uint32 bilinear_output_units = 3;
+
+    required uint32 senet_reduction_ratio = 4 [default = 3];
+    optional uint32 num_senet_squeeze_group = 5 [default = 2];
+
+    required DNN mlp = 6;
+}
diff --git a/easy_rec/python/protos/masknet.proto b/easy_rec/python/protos/masknet.proto
new file mode 100644
index 000000000..c9b0b703a
--- /dev/null
+++ b/easy_rec/python/protos/masknet.proto
@@ -0,0 +1,17 @@
+syntax = "proto2";
+package protos;
+
+import "easy_rec/python/protos/dnn.proto";
+
+message MaskBlock {
+    optional float reduction_factor = 1;
+    required uint32 output_size = 2;
+    optional uint32 aggregation_size = 3;
+    optional bool input_layer_norm = 4 [default = true];
+}
+
+message MaskNet {
+    repeated MaskBlock mask_blocks = 1;
+    required bool use_parallel = 2 [default = true];
+    optional DNN mlp = 3;
+}
\ No newline at end of file
diff --git a/easy_rec/python/protos/variational_dropout.proto b/easy_rec/python/protos/variational_dropout.proto
index c643b3d2e..e76a0fb3b 100644
--- a/easy_rec/python/protos/variational_dropout.proto
+++ b/easy_rec/python/protos/variational_dropout.proto
@@ -13,5 +13,9 @@ message VariationalDropoutLayer {
     optional float feature_complexity_weight = 4 [default = 1.0];
     optional float feature_dimension_weight = 5 [default = 1e-2];
     optional float feature_cardinality_weight = 6 [default = 1e-7];
-    optional uint32 fine_tune_use_top_k_features = 7;
+    // temperature
+    optional float temperature = 7 [default = 0.1];
+
+    optional float min_keep_ratio = 8 [default = 1e-3];
+    optional float max_keep_ratio = 9 [default = 1.0];
 }
diff --git a/easy_rec/python/tools/feature_selection.py b/easy_rec/python/tools/feature_selection.py
index 065993652..bd31fef9b 100644
--- a/easy_rec/python/tools/feature_selection.py
+++ b/easy_rec/python/tools/feature_selection.py
@@ -423,7 +423,7 @@ def _process_config(self, feature_importance_map, white_feature_group):
     for feature_config in config_util.get_compatible_feature_configs(config):
       feature_name = feature_config.input_names[0]
       if feature_config.HasField('feature_name'):
-          feature_name = feature_config.feature_name
+        feature_name = feature_config.feature_name
       if feature_name in excluded_features:
         feature_config.feature_type = FeatureConfig.FeatureType.ConstFeature
 
diff --git a/easy_rec/python/tools/view_saved_model.py b/easy_rec/python/tools/view_saved_model.py
index a3c01909b..022bcf1aa 100644
--- a/easy_rec/python/tools/view_saved_model.py
+++ b/easy_rec/python/tools/view_saved_model.py
@@ -4,8 +4,8 @@
 import logging
 
 from google.protobuf import text_format
-from tensorflow.python.platform.gfile import GFile
 from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.python.platform.gfile import GFile
 
 logging.basicConfig(
     format='[%(levelname)s] %(asctime)s %(filename)s:%(lineno)d : %(message)s',
@@ -15,7 +15,8 @@
   parser = argparse.ArgumentParser()
   parser.add_argument(
       '--input', type=str, default=None, help='saved model path')
-  parser.add_argument('--output', type=str, default=None, help='saved model save path')    
+  parser.add_argument(
+      '--output', type=str, default=None, help='saved model save path')
   args = parser.parse_args()
 
   assert args.input is not None and args.output is not None
@@ -29,7 +30,7 @@
   else:
     with GFile(args.input, 'r') as fin:
       text_format.Merge(fin.read(), saved_model)
-   
+
   if args.output.endswith('.pbtxt'):
     with GFile(args.output, 'w') as fout:
       fout.write(text_format.MessageToString(saved_model, as_utf8=True))

From 48601c7f3559f9456c8cca3c436de9317d81dcc5 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 9 Jun 2023 18:02:06 +0800
Subject: [PATCH 28/54] [feat]: add backbone network

---
 easy_rec/python/builders/loss_builder.py      |   9 +-
 easy_rec/python/compat/array_ops.py           | 229 ++++++++++++++++++
 .../compat/feature_column/feature_column.py   |  23 +-
 easy_rec/python/layers/backbone.py            | 195 +++++++++++++++
 easy_rec/python/layers/common_layers.py       |  68 +++---
 easy_rec/python/layers/dnn.py                 |  11 +-
 easy_rec/python/layers/fibinet.py             |  49 ++--
 easy_rec/python/layers/fscd_layer.py          |  25 +-
 easy_rec/python/layers/mask_net.py            |  58 +++--
 easy_rec/python/layers/numerical_embedding.py |  39 +++
 easy_rec/python/loss/info_nce_loss.py         |  41 ++++
 easy_rec/python/loss/jrc_loss.py              |  57 ++++-
 easy_rec/python/model/dbmtl.py                |  92 +++----
 easy_rec/python/model/easy_rec_model.py       |  35 ++-
 easy_rec/python/protos/backbone.proto         |  44 ++++
 easy_rec/python/protos/cmbf.proto             |  43 +++-
 easy_rec/python/protos/dbmtl.proto            |  20 +-
 easy_rec/python/protos/easy_rec_model.proto   |   7 +-
 easy_rec/python/protos/fibinet.proto          |  22 +-
 easy_rec/python/protos/layer.proto            |  65 -----
 easy_rec/python/protos/loss.proto             |   2 +
 easy_rec/python/protos/masknet.proto          |   2 +-
 easy_rec/python/protos/uniter.proto           |  26 +-
 easy_rec/python/utils/dag.py                  | 205 ++++++++++++++++
 setup.cfg                                     |   2 +-
 25 files changed, 1114 insertions(+), 255 deletions(-)
 create mode 100644 easy_rec/python/compat/array_ops.py
 create mode 100644 easy_rec/python/layers/backbone.py
 create mode 100644 easy_rec/python/layers/numerical_embedding.py
 create mode 100644 easy_rec/python/loss/info_nce_loss.py
 create mode 100644 easy_rec/python/protos/backbone.proto
 create mode 100644 easy_rec/python/utils/dag.py

diff --git a/easy_rec/python/builders/loss_builder.py b/easy_rec/python/builders/loss_builder.py
index e1b32fde1..ec4ab57c8 100644
--- a/easy_rec/python/builders/loss_builder.py
+++ b/easy_rec/python/builders/loss_builder.py
@@ -41,16 +41,17 @@ def build(loss_type,
     return tf.losses.mean_squared_error(
         labels=label, predictions=pred, weights=loss_weight, **kwargs)
   elif loss_type == LossType.JRC_LOSS:
-    alpha = 0.5 if loss_param is None else loss_param.alpha
-    auto = False if loss_param is None else not loss_param.HasField('alpha')
     session = kwargs.get('session_ids', None)
+    if loss_param is None:
+      return jrc_loss(label, pred, session, name=loss_name)
     return jrc_loss(
         label,
         pred,
         session,
-        alpha,
-        auto_weight=auto,
+        loss_param.alpha,
+        loss_weight_strategy=loss_param.loss_weight_strategy,
         sample_weights=loss_weight,
+        same_label_loss=loss_param.same_label_loss,
         name=loss_name)
   elif loss_type == LossType.PAIR_WISE_LOSS:
     session = kwargs.get('session_ids', None)
diff --git a/easy_rec/python/compat/array_ops.py b/easy_rec/python/compat/array_ops.py
new file mode 100644
index 000000000..3e8929ceb
--- /dev/null
+++ b/easy_rec/python/compat/array_ops.py
@@ -0,0 +1,229 @@
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import gen_math_ops
+
+
+def convert_to_int_tensor(tensor, name, dtype=tf.int32):
+  """Converts the given value to an integer Tensor."""
+  tensor = ops.convert_to_tensor(tensor, name=name, preferred_dtype=dtype)
+  if tensor.dtype.is_integer:
+    tensor = gen_math_ops.cast(tensor, dtype)
+  else:
+    raise TypeError('%s must be an integer tensor; dtype=%s' %
+                    (name, tensor.dtype))
+  return tensor
+
+
+def _with_nonzero_rank(data):
+  """If `data` is scalar, then add a dimension; otherwise return as-is."""
+  if data.shape.ndims is not None:
+    if data.shape.ndims == 0:
+      return tf.stack([data])
+    else:
+      return data
+  else:
+    data_shape = tf.shape(data)
+    data_ndims = tf.rank(data)
+    return tf.reshape(data, tf.concat([[1], data_shape], axis=0)[-data_ndims:])
+
+
+def get_positive_axis(axis, ndims):
+  """Validate an `axis` parameter, and normalize it to be positive.
+
+  If `ndims` is known (i.e., not `None`), then check that `axis` is in the
+  range `-ndims <= axis < ndims`, and return `axis` (if `axis >= 0`) or
+  `axis + ndims` (otherwise).
+  If `ndims` is not known, and `axis` is positive, then return it as-is.
+  If `ndims` is not known, and `axis` is negative, then report an error.
+
+  Args:
+    axis: An integer constant
+    ndims: An integer constant, or `None`
+
+  Returns:
+    The normalized `axis` value.
+
+  Raises:
+    ValueError: If `axis` is out-of-bounds, or if `axis` is negative and
+      `ndims is None`.
+  """
+  if not isinstance(axis, int):
+    raise TypeError('axis must be an int; got %s' % type(axis).__name__)
+  if ndims is not None:
+    if 0 <= axis < ndims:
+      return axis
+    elif -ndims <= axis < 0:
+      return axis + ndims
+    else:
+      raise ValueError('axis=%s out of bounds: expected %s<=axis<%s' %
+                       (axis, -ndims, ndims))
+  elif axis < 0:
+    raise ValueError('axis may only be negative if ndims is statically known.')
+  return axis
+
+
+def tile_one_dimension(data, axis, multiple):
+  """Tiles a single dimension of a tensor."""
+  # Assumes axis is a nonnegative int.
+  if data.shape.ndims is not None:
+    multiples = [1] * data.shape.ndims
+    multiples[axis] = multiple
+  else:
+    ones_value = tf.ones(tf.rank(data), tf.int32)
+    multiples = tf.concat(
+        [ones_value[:axis], [multiple], ones_value[axis + 1:]], axis=0)
+  return tf.tile(data, multiples)
+
+
+def _all_dimensions(x):
+  """Returns a 1D-tensor listing all dimensions in x."""
+  # Fast path: avoid creating Rank and Range ops if ndims is known.
+  if isinstance(x, ops.Tensor) and x.get_shape().ndims is not None:
+    return constant_op.constant(np.arange(x.get_shape().ndims), dtype=tf.int32)
+  if (isinstance(x, sparse_tensor.SparseTensor) and
+      x.dense_shape.get_shape().is_fully_defined()):
+    r = x.dense_shape.get_shape().dims[0].value  # sparse.dense_shape is 1-D.
+    return constant_op.constant(np.arange(r), dtype=tf.int32)
+
+  # Otherwise, we rely on `range` and `rank` to do the right thing at runtime.
+  return gen_math_ops._range(0, tf.rank(x), 1)
+
+
+# This op is intended to exactly match the semantics of numpy.repeat, with
+# one exception: numpy.repeat has special (and somewhat non-intuitive) behavior
+# when axis is not specified.  Rather than implement that special behavior, we
+# simply make `axis` be a required argument.
+#
+# External (OSS) `tf.repeat` feature request:
+# https://github.com/tensorflow/tensorflow/issues/8246
+def repeat_with_axis(data, repeats, axis, name=None):
+  """Repeats elements of `data`.
+
+  Args:
+    data: An `N`-dimensional tensor.
+    repeats: A 1-D integer tensor specifying how many times each element in
+      `axis` should be repeated.  `len(repeats)` must equal `data.shape[axis]`.
+      Supports broadcasting from a scalar value.
+    axis: `int`.  The axis along which to repeat values.  Must be less than
+      `max(N, 1)`.
+    name: A name for the operation.
+
+  Returns:
+    A tensor with `max(N, 1)` dimensions.  Has the same shape as `data`,
+    except that dimension `axis` has size `sum(repeats)`.
+  #### Examples:
+    ```python
+    >>> repeat(['a', 'b', 'c'], repeats=[3, 0, 2], axis=0)
+    ['a', 'a', 'a', 'c', 'c']
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=0)
+    [[1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=1)
+    [[1, 1, 2, 2, 2], [3, 3, 4, 4, 4]]
+    ```
+  """
+  if not isinstance(axis, int):
+    raise TypeError('axis must be an int; got %s' % type(axis).__name__)
+
+  with ops.name_scope(name, 'Repeat', [data, repeats]):
+    data = ops.convert_to_tensor(data, name='data')
+    repeats = convert_to_int_tensor(repeats, name='repeats')
+    repeats.shape.with_rank_at_most(1)
+
+    # If `data` is a scalar, then upgrade it to a vector.
+    data = _with_nonzero_rank(data)
+    data_shape = tf.shape(data)
+
+    # If `axis` is negative, then convert it to a positive value.
+    axis = get_positive_axis(axis, data.shape.ndims)
+
+    # Check data Tensor shapes.
+    if repeats.shape.ndims == 1:
+      data.shape.dims[axis].assert_is_compatible_with(repeats.shape[0])
+
+    # If we know that `repeats` is a scalar, then we can just tile & reshape.
+    if repeats.shape.ndims == 0:
+      expanded = tf.expand_dims(data, axis + 1)
+      tiled = tile_one_dimension(expanded, axis + 1, repeats)
+      result_shape = tf.concat([data_shape[:axis], [-1], data_shape[axis + 1:]],
+                               axis=0)
+      return tf.reshape(tiled, result_shape)
+
+    # Broadcast the `repeats` tensor so rank(repeats) == axis + 1.
+    if repeats.shape.ndims != axis + 1:
+      repeats_shape = tf.shape(repeats)
+      repeats_ndims = tf.rank(repeats)
+      broadcast_shape = tf.concat(
+          [data_shape[:axis + 1 - repeats_ndims], repeats_shape], axis=0)
+      repeats = tf.broadcast_to(repeats, broadcast_shape)
+      repeats.set_shape([None] * (axis + 1))
+
+    # Create a "sequence mask" based on `repeats`, where slices across `axis`
+    # contain one `True` value for each repetition.  E.g., if
+    # `repeats = [3, 1, 2]`, then `mask = [[1, 1, 1], [1, 0, 0], [1, 1, 0]]`.
+    max_repeat = gen_math_ops.maximum(
+        0, gen_math_ops._max(repeats, _all_dimensions(repeats)))
+    mask = tf.sequence_mask(repeats, max_repeat)
+
+    # Add a new dimension around each value that needs to be repeated, and
+    # then tile that new dimension to match the maximum number of repetitions.
+    expanded = tf.expand_dims(data, axis + 1)
+    tiled = tile_one_dimension(expanded, axis + 1, max_repeat)
+
+    # Use `boolean_mask` to discard the extra repeated values.  This also
+    # flattens all dimensions up through `axis`.
+    masked = tf.boolean_mask(tiled, mask)
+
+    # Reshape the output tensor to add the outer dimensions back.
+    if axis == 0:
+      result = masked
+    else:
+      result_shape = tf.concat([data_shape[:axis], [-1], data_shape[axis + 1:]],
+                               axis=0)
+      result = tf.reshape(masked, result_shape)
+
+    # Preserve shape information.
+    if data.shape.ndims is not None:
+      new_axis_size = 0 if repeats.shape[0] == 0 else None
+      result.set_shape(data.shape[:axis].concatenate(
+          [new_axis_size]).concatenate(data.shape[axis + 1:]))
+
+    return result
+
+
+def repeat(input, repeats, axis=None, name=None):  # pylint: disable=redefined-builtin
+  """Repeat elements of `input`
+
+  Args:
+    input: An `N`-dimensional Tensor.
+    repeats: An 1-D `int` Tensor. The number of repetitions for each element.
+      repeats is broadcasted to fit the shape of the given axis. `len(repeats)`
+      must equal `input.shape[axis]` if axis is not None.
+    axis: An int. The axis along which to repeat values. By default (axis=None),
+      use the flattened input array, and return a flat output array.
+    name: A name for the operation.
+
+  Returns:
+    A Tensor which has the same shape as `input`, except along the given axis.
+      If axis is None then the output array is flattened to match the flattened
+      input array.
+  #### Examples:
+    ```python
+    >>> repeat(['a', 'b', 'c'], repeats=[3, 0, 2], axis=0)
+    ['a', 'a', 'a', 'c', 'c']
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=0)
+    [[1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=1)
+    [[1, 1, 2, 2, 2], [3, 3, 4, 4, 4]]
+    >>> repeat(3, repeats=4)
+    [3, 3, 3, 3]
+    >>> repeat([[1,2], [3,4]], repeats=2)
+    [1, 1, 2, 2, 3, 3, 4, 4]
+    ```
+  """
+  if axis is None:
+    input = tf.reshape(input, [-1])
+    axis = 0
+  return repeat_with_axis(input, repeats, axis, name)
diff --git a/easy_rec/python/compat/feature_column/feature_column.py b/easy_rec/python/compat/feature_column/feature_column.py
index 09d791386..7d8419528 100644
--- a/easy_rec/python/compat/feature_column/feature_column.py
+++ b/easy_rec/python/compat/feature_column/feature_column.py
@@ -211,15 +211,19 @@ def _get_logits():  # pylint: disable=missing-docstring
         output_tensor = array_ops.reshape(
             tensor, shape=(batch_size, num_elements))
         if do_normalize:
-          from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn, NumericColumn, \
-            WeightedCategoricalColumn
+          from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn,\
+            NumericColumn, WeightedCategoricalColumn
           from tensorflow.python.layers.normalization import batch_normalization
-          if isinstance(column, EmbeddingColumn) or isinstance(column, _SharedEmbeddingColumn):
+          if isinstance(column, EmbeddingColumn) or isinstance(
+              column, _SharedEmbeddingColumn):
             fc = column.categorical_column
-            if isinstance(fc, WeightedCategoricalColumn) and fc.weight_feature_key.endswith('_raw_proj_val'):
-              output_tensor = layer_norm(output_tensor, name='ln_' + column.name)
+            if isinstance(fc, WeightedCategoricalColumn
+                          ) and fc.weight_feature_key.endswith('_raw_proj_val'):
+              output_tensor = layer_norm(
+                  output_tensor, name='ln_' + column.name)
             else:
-              output_tensor = batch_normalization(output_tensor, name='bn_'+column.name)
+              output_tensor = batch_normalization(
+                  output_tensor, name='bn_' + column.name)
           elif isinstance(column, NumericColumn) and int(column.shape[-1]) > 1:
             output_tensor = layer_norm(output_tensor, name='ln_' + column.name)
         output_tensors.append(output_tensor)
@@ -2552,9 +2556,10 @@ def raw_name(self):
 
   @property
   def cardinality(self):
-    from easy_rec.python.compat.feature_column.feature_column_v2 import HashedCategoricalColumn, BucketizedColumn, \
-      WeightedCategoricalColumn, SequenceWeightedCategoricalColumn, CrossedColumn, IdentityCategoricalColumn, \
-      VocabularyListCategoricalColumn, VocabularyFileCategoricalColumn
+    from easy_rec.python.compat.feature_column.feature_column_v2 import HashedCategoricalColumn,\
+      BucketizedColumn, WeightedCategoricalColumn, SequenceWeightedCategoricalColumn, \
+      CrossedColumn, IdentityCategoricalColumn, VocabularyListCategoricalColumn,\
+      VocabularyFileCategoricalColumn
 
     fc = self.categorical_column
     if isinstance(fc, HashedCategoricalColumn) or isinstance(fc, CrossedColumn):
diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
new file mode 100644
index 000000000..285ff80c5
--- /dev/null
+++ b/easy_rec/python/layers/backbone.py
@@ -0,0 +1,195 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+
+import tensorflow as tf
+
+from easy_rec.python.utils.dag import DAG
+from easy_rec.python.layers import dnn
+from easy_rec.python.layers.common_layers import layer_norm, SENet
+from easy_rec.python.layers.numerical_embedding import NumericalEmbedding
+from easy_rec.python.layers.fibinet import FiBiNetLayer
+from easy_rec.python.layers.mask_net import MaskNet
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class EnhancedInputLayer(object):
+  def __init__(self, config, input_layer, feature_dict):
+    if config.do_batch_norm and config.do_layer_norm:
+      raise ValueError('can not do batch norm and layer norm for input layer at the same time')
+    self._config = config
+    self._input_layer = input_layer
+    self._feature_dict = feature_dict
+
+  def __call__(self, feature_group, is_training, *args, **kwargs):
+    features, feature_list = self._input_layer(self._feature_dict, feature_group)
+    num_features = len(feature_list)
+
+    do_feature_dropout = 0.0 < self._config.feature_dropout_rate < 1.0
+    if self._config.output_feature_list or do_feature_dropout:
+      if self._config.do_layer_norm or self._config.do_batch_norm:
+        for i in range(num_features):
+          fea = feature_list[i]
+          if self._config.do_batch_norm:
+            fea = tf.layers.batch_normalization(fea, training=is_training)
+          elif self._config.do_layer_norm:
+            fea = layer_norm(fea)
+          feature_list[i] = fea
+    elif self._config.do_batch_norm:
+      features = tf.layers.batch_normalization(features, training=is_training)
+    elif self._config.do_layer_norm:
+      features = layer_norm(features)
+
+    if do_feature_dropout and is_training:
+      keep_prob = 1.0 - self._config.feature_dropout_rate
+      bern = tf.distributions.Bernoulli(probs=keep_prob)
+      mask = bern.sample(num_features)
+      for i in range(num_features):
+        fea = tf.div(feature_list[i], keep_prob) * mask[i]
+        feature_list[i] = fea
+      features = tf.concat(feature_list, axis=-1)
+
+    do_dropout = 0.0 < self._config.dropout_rate < 1.0
+    if self._config.output_feature_list:
+      if do_dropout:
+        for i in range(num_features):
+          fea = feature_list[i]
+          fea = tf.layers.dropout(fea, self._config.dropout_rate, training=is_training)
+          feature_list[i] = fea
+      return feature_list
+    if do_dropout:
+      return tf.layers.dropout(features, self._config.dropout_rate, training=is_training)
+    return features
+
+
+class Backbone(object):
+  def __init__(self, config, model, features, input_layer, l2_reg=None):
+    self._model = model
+    self._config = config
+    self._features = features
+    self._input_layer = input_layer
+    self._l2_reg = l2_reg
+    self._dag = DAG()
+    self._name_to_blocks = {}
+    for block in config.blocks:
+      self._name_to_blocks[block.name] = block
+      self._dag.add_node(block.name)
+    assert len(self._name_to_blocks) > 0, 'there must be more than one block in backbone'
+    for block in config.blocks:
+      assert len(block.inputs) > 0, 'there is no input for block: %s' % block.name
+      for node in block.inputs:
+        if node in self._name_to_blocks:
+          self._dag.add_edge(node, block.name)
+
+  def block_input(self, config, block_outputs):
+    inputs = []
+    for input_name in config.inputs:
+      if input_name in block_outputs:
+        input_feature = block_outputs[input_name]
+      else:
+        input_feature, _ = self._input_layer(self._features, input_name)
+      inputs.append(input_feature)
+    return concat_inputs(inputs, config.name)
+
+  def __call__(self, is_training, *args, **kwargs):
+    block_outputs = {}
+    blocks = self._dag.topological_sort()
+    logging.info("backbone topological: " + ','.join(blocks))
+    for block in blocks:
+      config = self._name_to_blocks[block]
+      layer = config.WhichOneof('layer')
+      if layer == 'input_layer':
+        assert len(config.inputs) == 1, 'only one input needed for input_layer: ' + block.name
+        conf = config.input_layer
+        input_layer = EnhancedInputLayer(conf, self._input_layer, self._features)
+        output = input_layer(config.inputs[0], is_training)
+        block_outputs[block] = output
+      elif layer == 'numerical_embedding':
+        conf = config.numerical_embedding
+        num_emb = NumericalEmbedding(conf.embedding_dim, stddev=conf.coef_stddev,
+                                     scope='%s_numerical_embedding' % block)
+        input_feature = self.block_input(config, block_outputs)
+        block_outputs[block] = num_emb(input_feature)
+      elif layer == 'mlp':
+        mlp = dnn.DNN(
+          config.mlp,
+          self._l2_reg,
+          name='%s_mlp' % block,
+          is_training=is_training)
+        input_feature = self.block_input(config, block_outputs)
+        output = mlp(input_feature)
+        block_outputs[block] = output
+      elif layer == 'sequence_encoder':
+        block_outputs[block] = self.sequence_encoder(config, is_training)
+      elif layer == 'masknet':
+        conf = config.masknet
+        mask_net = MaskNet(
+          conf,
+          name=block,
+          reuse=tf.AUTO_REUSE)
+        input_feature = self.block_input(config, block_outputs)
+        output = mask_net(
+          input_feature, is_training, l2_reg=self._l2_reg)
+        block_outputs[block] = output
+      elif layer == 'senet':
+        conf = config.senet
+        senet = SENet(conf, name=block)
+        input_feature = self.block_input(config, block_outputs)
+        output = senet(input_feature)
+        block_outputs[block] = output
+      elif layer == 'fibinet':
+        conf = config.fibinet
+        fibinet = FiBiNetLayer(conf, name=block)
+        input_feature = self.block_input(config, block_outputs)
+        output = fibinet(input_feature, is_training, l2_reg=self._l2_reg)
+        block_outputs[block] = output
+      else:
+        raise ValueError('Unsupported backbone layer:' + layer)
+
+    temp = []
+    for output in self._config.concat_blocks:
+      if output in block_outputs:
+        temp.append(block_outputs[output])
+      else:
+        raise ValueError('No output `%s` of backbone to be concat' % output)
+
+    output = concat_inputs(temp)
+    if self._config.HasField('top_mlp'):
+      final_dnn = dnn.DNN(
+        self._config.top_mlp,
+        self._l2_reg,
+        name='backbone_top_mlp',
+        is_training=is_training)
+      output = final_dnn(output)
+    return output
+
+  def sequence_encoder(self, config, is_training):
+    encodings = []
+    for seq_input in config.inputs:
+      encoding = self._model.get_sequence_encoding(seq_input, is_training)
+      encodings.append(encoding)
+    encoding = concat_inputs(encodings)
+    conf = config.sequence_encoder
+    if conf.HasField('mlp'):
+      sequence_dnn = dnn.DNN(
+        conf.mlp,
+        self._l2_reg,
+        name='%s_seq_dnn' % config.name,
+        is_training=is_training)
+      encoding = sequence_dnn(encoding)
+    return encoding
+
+
+def concat_inputs(inputs, msg=''):
+  if len(inputs) > 1:
+    if type(inputs[0]) == list:
+      from functools import reduce
+      return reduce(lambda x, y: x + y, inputs)
+    return tf.concat(inputs, axis=-1)
+  if len(inputs) == 1:
+    return inputs[0]
+  raise ValueError('no inputs to be concat:' + msg)
+
+
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index e3bb65f64..892e75550 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -70,49 +70,52 @@ def text_cnn(x,
 def layer_norm(input_tensor, name=None, reuse=None):
   """Run layer normalization on the last dimension of the tensor."""
   return tf_layer_norm(
-      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, reuse=reuse, scope=name)
+      inputs=input_tensor,
+      begin_norm_axis=-1,
+      begin_params_axis=-1,
+      reuse=reuse,
+      scope=name)
 
 
 class SENet(object):
+  """SENet+ Layer used in FiBiNET，支持不同field的embedding dimension不等.
+
+  arxiv: 2209.05016
   """
-    SENet+ Layer，支持不同field的embedding dimension不等
-    arxiv: 2209.05016
-    """
 
-  def __init__(self, reduction_ratio, num_groups, name='SENet'):
-    self.reduction_ratio = reduction_ratio
-    self.num_groups = num_groups
+  def __init__(self, config, name='SENet'):
+    self.config = config
     self.name = name
 
   def __call__(self, embedding_list):
-    """
-
-      :param embedding_list: [embedding_1,...,embedding_i,...,embedding_f]，f为field的数目，embedding_i is [bs, dim]
-      :return:
-      """
-    print("SENET layer with %d inputs" % len(embedding_list))
+    """embedding_list:  - A list of 2D tensor with shape: ``(batch_size,embedding_size)``."""
+    print('SENET layer with %d inputs' % len(embedding_list))
+    g = self.config.num_squeeze_group
     for emb in embedding_list:
       assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
+      dim = int(emb.shape[-1])
+      assert dim >= g and dim % g == 0, 'field embedding dimension %d must be divisible by %d' % (
+          dim, g)
 
     field_size = len(embedding_list)
     feature_size_list = [emb.shape.as_list()[-1] for emb in embedding_list]
 
     # Squeeze
-    g = self.num_groups
     # embedding dimension 必须能被 g 整除
     group_embs = [
-        tf.reshape(emb, [-1, g, tf.shape(emb)[-1] // g])
+        tf.reshape(emb, [-1, g, int(emb.shape[-1]) // g])
         for emb in embedding_list
     ]
 
     squeezed = []
     for emb in group_embs:
-      squeezed.append(tf.reduce_max(emb, axis=-1))
-      squeezed.append(tf.reduce_mean(emb, axis=-1))
+      squeezed.append(tf.reduce_max(emb, axis=-1))  # [B, g]
+      squeezed.append(tf.reduce_mean(emb, axis=-1))  # [B, g]
     z = tf.concat(squeezed, axis=1)  # [bs, field_size * num_groups * 2]
 
     # Excitation
-    reduction_size = max(1, field_size * g * 2 // self.reduction_ratio)
+    r = self.config.reduction_ratio
+    reduction_size = max(1, field_size * g * 2 // r)
 
     initializer = tf.glorot_normal_initializer()
     a1 = tf.layers.dense(
@@ -121,18 +124,24 @@ def __call__(self, embedding_list):
         kernel_initializer=initializer,
         activation=tf.nn.relu,
         name='%s/W1' % self.name)
-    a2 = tf.layers.dense(
+    weights = tf.layers.dense(
         a1,
         sum(feature_size_list),
         kernel_initializer=initializer,
         name='%s/W2' % self.name)
 
-    # Re-weight & Fuse
-    a = tf.split(a2, feature_size_list, axis=1)
-    senet_like_embeddings = [
-        layer_norm(emb * w + emb) for emb, w in zip(embedding_list, a)
-    ]
-    return tf.concat(senet_like_embeddings, axis=-1)
+    # Re-weight
+    inputs = tf.concat(embedding_list, axis=-1)
+    output = inputs * weights
+
+    # Fuse, add skip-connection
+    if self.config.use_skip_connection:
+      output += inputs
+
+    # Layer Normalization
+    if self.config.use_output_layer_norm:
+      output = layer_norm(output)
+    return output
 
 
 def _full_interaction(v_i, v_j):
@@ -170,9 +179,10 @@ def __init__(self,
       self.func = tf.multiply
 
   def __call__(self, embeddings):
-    print("Bilinear Layer with %d inputs" % len(embeddings))
+    print('Bilinear Layer with %d inputs' % len(embeddings))
     if len(embeddings) > 200:
-      logging.warn("There are too many inputs for bilinear layer: %d" % len(embeddings))
+      logging.warn('There are too many inputs for bilinear layer: %d' %
+                   len(embeddings))
     equal_dim = True
     _dim = embeddings[0].shape[-1]
     for emb in embeddings:
@@ -180,7 +190,9 @@ def __call__(self, embeddings):
       if emb.shape[-1] != _dim:
         equal_dim = False
     if not equal_dim and self.bilinear_type != 'interaction':
-      raise ValueError('all embedding dimensions must be same when use bilinear type: interaction')
+      raise ValueError(
+          'all embedding dimensions must be same when not use bilinear type: interaction'
+      )
     dim = int(_dim)
 
     field_size = len(embeddings)
diff --git a/easy_rec/python/layers/dnn.py b/easy_rec/python/layers/dnn.py
index 7a57f5661..e09891845 100644
--- a/easy_rec/python/layers/dnn.py
+++ b/easy_rec/python/layers/dnn.py
@@ -18,7 +18,8 @@ def __init__(self,
                name='dnn',
                is_training=False,
                last_layer_no_activation=False,
-               last_layer_no_batch_norm=False):
+               last_layer_no_batch_norm=False,
+               reuse=None):
     """Initializes a `DNN` Layer.
 
     Args:
@@ -28,6 +29,7 @@ def __init__(self,
       is_training: train phase or not, impact batch_norm and dropout
       last_layer_no_activation: in last layer, use or not use activation
       last_layer_no_batch_norm: in last layer, use or not use batch norm
+      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
     """
     self._config = dnn_config
     self._l2_reg = l2_reg
@@ -38,6 +40,7 @@ def __init__(self,
         self._config.activation, training=is_training)
     self._last_layer_no_activation = last_layer_no_activation
     self._last_layer_no_batch_norm = last_layer_no_batch_norm
+    self._reuse = reuse
 
   @property
   def hidden_units(self):
@@ -59,14 +62,16 @@ def __call__(self, deep_fea, hidden_layer_feature_output=False):
           units=unit,
           kernel_regularizer=self._l2_reg,
           activation=None,
-          name='%s/dnn_%d' % (self._name, i))
+          name='%s/dnn_%d' % (self._name, i),
+          reuse=self._reuse)
       if self._config.use_bn and ((i + 1 < hidden_units_len) or
                                   not self._last_layer_no_batch_norm):
         deep_fea = tf.layers.batch_normalization(
             deep_fea,
             training=self._is_training,
             trainable=True,
-            name='%s/dnn_%d/bn' % (self._name, i))
+            name='%s/dnn_%d/bn' % (self._name, i),
+            reuse=self._reuse)
       if (i + 1 < hidden_units_len) or not self._last_layer_no_activation:
         deep_fea = self.activation(
             deep_fea, name='%s/dnn_%d/act' % (self._name, i))
diff --git a/easy_rec/python/layers/fibinet.py b/easy_rec/python/layers/fibinet.py
index 9a419e004..d112561ff 100644
--- a/easy_rec/python/layers/fibinet.py
+++ b/easy_rec/python/layers/fibinet.py
@@ -1,10 +1,10 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import tensorflow as tf
-from easy_rec.python.layers.common_layers import SENet
-from easy_rec.python.layers.common_layers import BiLinear
-from easy_rec.python.layers import dnn
 
+from easy_rec.python.layers import dnn
+from easy_rec.python.layers.common_layers import BiLinear
+from easy_rec.python.layers.common_layers import SENet
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -18,26 +18,25 @@ class FiBiNetLayer(object):
   https://arxiv.org/pdf/2209.05016.pdf
   """
 
-  def __init__(self, fibinet_config, features, input_layer):
+  def __init__(self, fibinet_config, name='fibinet'):
     self._config = fibinet_config
-    self._input_layer = input_layer
-    self._features = features
+    self.name = name
 
-  def __call__(self, group_name, is_training, l2_reg=0, *args, **kwargs):
+  def __call__(self, inputs, is_training, l2_reg=None, *args, **kwargs):
     feature_list = []
-    _, group_features = self._input_layer(self._features, group_name)
-    senet = SENet(reduction_ratio=self._config.senet_reduction_ratio,
-                       num_groups=self._config.num_senet_squeeze_group,
-                       name='%s_senet' % group_name)
-    senet_output = senet(group_features)
+
+    senet = SENet(self._config.senet, name='%s_senet' % self.name)
+    senet_output = senet(inputs)
     feature_list.append(senet_output)
 
-    if self._config.bilinear_type != 'none':
-      bilinear = BiLinear(output_size=self._config.bilinear_output_units,
-                          bilinear_type=self._config.bilinear_type,
-                          bilinear_plus=self._config.use_bilinear_plus,
-                          name='%s_bilinear' % group_name)
-      bilinear_output = bilinear(group_features)
+    if self._config.HasField('bilinear'):
+      conf = self._config.bilinear
+      bilinear = BiLinear(
+          output_size=conf.output_units,
+          bilinear_type=conf.type,
+          bilinear_plus=conf.use_plus,
+          name='%s_bilinear' % self.name)
+      bilinear_output = bilinear(inputs)
       feature_list.append(bilinear_output)
 
     if len(feature_list) > 1:
@@ -45,9 +44,11 @@ def __call__(self, group_name, is_training, l2_reg=0, *args, **kwargs):
     else:
       feature = feature_list[0]
 
-    final_dnn = dnn.DNN(
-      self._config.mlp,
-      l2_reg,
-      name='%s_fibinet_mlp' % group_name,
-      is_training=is_training)
-    return final_dnn(feature)
+    if self._config.HasField('mlp'):
+      final_dnn = dnn.DNN(
+        self._config.mlp,
+        l2_reg,
+        name='%s_fibinet_mlp' % self.name,
+        is_training=is_training)
+      feature = final_dnn(feature)
+    return feature
diff --git a/easy_rec/python/layers/fscd_layer.py b/easy_rec/python/layers/fscd_layer.py
index ec115f547..daccf750e 100644
--- a/easy_rec/python/layers/fscd_layer.py
+++ b/easy_rec/python/layers/fscd_layer.py
@@ -35,14 +35,14 @@ def sigmoid(x):
 
 def get_feature_importance(pipeline_config, feature_group_name=None):
   assert pipeline_config.model_config.HasField(
-    'variational_dropout'), 'variational_dropout must be in model_config'
+      'variational_dropout'), 'variational_dropout must be in model_config'
 
   checkpoint_path = tf.train.latest_checkpoint(pipeline_config.model_dir)
   meta_graph_def = read_meta_graph_file(checkpoint_path + '.meta')
 
   features_map = dict()
   for col_def in meta_graph_def.collection_def[
-    'variational_dropout'].bytes_list.value:
+      'variational_dropout'].bytes_list.value:
     features = json.loads(col_def)
     features_map.update(features)
 
@@ -108,10 +108,10 @@ def __init__(self,
   def compute_dropout_mask(self, n):
     delta_name = 'fscd_delta_%s' % self.name
     delta = tf.get_variable(
-      name=delta_name,
-      shape=[n],
-      dtype=tf.float32,
-      initializer=tf.constant_initializer(0.))
+        name=delta_name,
+        shape=[n],
+        dtype=tf.float32,
+        initializer=tf.constant_initializer(0.))
     delta = tf.nn.sigmoid(delta)
     epsilon = np.finfo(float).eps
     max_keep_ratio = self._config.max_keep_ratio
@@ -126,8 +126,9 @@ def compute_dropout_mask(self, n):
                                    dtype=tf.float32,
                                    seed=None,
                                    name='uniform_noise')
-    approx = (tf.log(delta) - tf.log(1. - delta) +
-              tf.log(unif_noise) - tf.log(1. - unif_noise))
+    approx = (
+        tf.log(delta) - tf.log(1. - delta) + tf.log(unif_noise) -
+        tf.log(1. - unif_noise))
     return tf.sigmoid(approx / self._config.temperature), delta
 
   def compute_regular_params(self, cols_to_feature):
@@ -147,14 +148,12 @@ def compute_regular_params(self, cols_to_feature):
       alpha = math.log(sig_c) - math.log(theta)
       alphas[fc] = alpha
       print(
-        str(fc.raw_name), 'complexity:', complexity, 'cardinality:', cardinal,
-        'dimension:', dim, 'c:', c, 'theta:', theta, 'alpha:', alpha)
+          str(fc.raw_name), 'complexity:', complexity, 'cardinality:', cardinal,
+          'dimension:', dim, 'c:', c, 'theta:', theta, 'alpha:', alpha)
     return alphas
 
   def __call__(self, cols_to_feature):
-    """
-    cols_to_feature: an ordered dict mapping feature_column to feature_values
-    """
+    """cols_to_feature: an ordered dict mapping feature_column to feature_values."""
     feature_dimension = []
     output_tensors = []
     alphas = []
diff --git a/easy_rec/python/layers/mask_net.py b/easy_rec/python/layers/mask_net.py
index fe4816fe8..fbe75c13c 100644
--- a/easy_rec/python/layers/mask_net.py
+++ b/easy_rec/python/layers/mask_net.py
@@ -10,17 +10,22 @@
 
 
 class MaskBlock(object):
-  def __init__(self, mask_block_config):
+
+  def __init__(self, mask_block_config, name='mask_block', reuse=None):
     self.mask_block_config = mask_block_config
+    self.name = name
+    self.reuse = reuse
 
   def __call__(self, net, mask_input):
     mask_input_dim = int(mask_input.shape[-1])
     if self.mask_block_config.HasField('reduction_factor'):
-      aggregation_size = int(mask_input_dim * self.mask_block_config.reduction_factor)
+      aggregation_size = int(mask_input_dim *
+                             self.mask_block_config.reduction_factor)
     elif self.mask_block_config.HasField('aggregation_size') is not None:
       aggregation_size = self.mask_block_config.aggregation_size
     else:
-      raise ValueError("Need one of reduction factor or aggregation size for MaskBlock.")
+      raise ValueError(
+          'Need one of reduction factor or aggregation size for MaskBlock.')
 
     if self.mask_block_config.input_layer_norm:
       input_name = net.name.replace(':', '_')
@@ -28,45 +33,66 @@ def __call__(self, net, mask_input):
 
     # initializer = tf.initializers.variance_scaling()
     initializer = tf.glorot_uniform_initializer()
-    mask = tf.layers.dense(mask_input, aggregation_size,
-                           activation=tf.nn.relu,
-                           kernel_initializer=initializer)
-    mask = tf.layers.dense(mask, net.shape[-1])
+    mask = tf.layers.dense(
+        mask_input,
+        aggregation_size,
+        activation=tf.nn.relu,
+        kernel_initializer=initializer,
+        name='%s/hidden' % self.name,
+        reuse=self.reuse)
+    mask = tf.layers.dense(
+        mask, net.shape[-1], name='%s/mask' % self.name, reuse=self.reuse)
     masked_net = net * mask
 
     output_size = self.mask_block_config.output_size
-    hidden_layer_output = tf.layers.dense(masked_net, output_size)
-    return layer_norm(hidden_layer_output)
+    hidden_layer_output = tf.layers.dense(
+        masked_net, output_size, name='%s/output' % self.name, reuse=self.reuse)
+    return layer_norm(
+        hidden_layer_output, name='%s/ln_output' % self.name, reuse=self.reuse)
 
 
 class MaskNet(object):
-  def __init__(self, mask_net_config, name='mask_net'):
+
+  def __init__(self, mask_net_config, name='mask_net', reuse=None):
     self.mask_net_config = mask_net_config
     self.name = name
+    self.reuse = reuse
 
   def __call__(self, inputs, is_training, l2_reg=None):
     conf = self.mask_net_config
     if conf.use_parallel:
       mask_outputs = []
-      for block_conf in self.mask_net_config.mask_blocks:
-        mask_layer = MaskBlock(block_conf)
+      for i, block_conf in enumerate(self.mask_net_config.mask_blocks):
+        mask_layer = MaskBlock(
+            block_conf, name='%s/block_%d' % (self.name, i), reuse=self.reuse)
         mask_outputs.append(mask_layer(mask_input=inputs, net=inputs))
       all_mask_outputs = tf.concat(mask_outputs, axis=1)
 
       if conf.HasField('mlp'):
-        mlp = dnn.DNN(conf.mlp, l2_reg, name='%s/mlp' % self.name, is_training=is_training)
+        mlp = dnn.DNN(
+            conf.mlp,
+            l2_reg,
+            name='%s/mlp' % self.name,
+            is_training=is_training,
+            reuse=self.reuse)
         output = mlp(all_mask_outputs)
       else:
         output = all_mask_outputs
       return output
     else:
       net = inputs
-      for block_conf in self.mask_net_config.mask_blocks:
-        mask_layer = MaskBlock(block_conf)
+      for i, block_conf in enumerate(self.mask_net_config.mask_blocks):
+        mask_layer = MaskBlock(
+            block_conf, name='%s/block_%d' % (self.name, i), reuse=self.reuse)
         net = mask_layer(net=net, mask_input=inputs)
 
       if conf.HasField('mlp'):
-        mlp = dnn.DNN(conf.mlp, l2_reg, name='%s/mlp' % self.name, is_training=is_training)
+        mlp = dnn.DNN(
+            conf.mlp,
+            l2_reg,
+            name='%s/mlp' % self.name,
+            is_training=is_training,
+            reuse=self.reuse)
         output = mlp(net)
       else:
         output = net
diff --git a/easy_rec/python/layers/numerical_embedding.py b/easy_rec/python/layers/numerical_embedding.py
new file mode 100644
index 000000000..420716254
--- /dev/null
+++ b/easy_rec/python/layers/numerical_embedding.py
@@ -0,0 +1,39 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+
+import tensorflow as tf
+from easy_rec.python.compat.array_ops import repeat
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class NumericalEmbedding(object):
+
+  def __init__(self, embedding_dim, scope='numerical_embedding', stddev=1.0):
+    self.embedding_dim = embedding_dim
+    self.scope = scope
+    self.initializer = tf.random_normal_initializer(stddev=stddev)
+
+  def __call__(self, inputs, *args, **kwargs):
+    if inputs.shape.ndims != 2:
+      raise ValueError('inputs of NumericalEmbedding must have 2 dimensions.')
+
+    num_features = int(inputs.shape[-1])
+    with tf.variable_scope(self.scope):
+      c = tf.get_variable(
+          'coef',
+          shape=[1, num_features * self.embedding_dim],
+          initializer=self.initializer)
+
+      features = repeat(inputs, self.embedding_dim, axis=1)
+      v = features * c * 2 * math.pi
+      sin_v = tf.split(tf.sin(v), num_features, axis=1)
+      cos_v = tf.split(tf.cos(v), num_features, axis=1)
+
+      embeddings = []
+      for val in zip(sin_v, cos_v):
+        embedding = tf.concat(val, axis=1)
+        embedding = tf.layers.dense(embedding, int(embedding.shape[-1]), activation=tf.nn.relu)
+        embeddings.append(embedding)
+      return tf.concat(embeddings, axis=1)
diff --git a/easy_rec/python/loss/info_nce_loss.py b/easy_rec/python/loss/info_nce_loss.py
new file mode 100644
index 000000000..3fd6b6b18
--- /dev/null
+++ b/easy_rec/python/loss/info_nce_loss.py
@@ -0,0 +1,41 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+def info_nce_loss(query, positive, temperature=0.1):
+  """Calculates the InfoNCE loss for self-supervised learning.
+
+  This contrastive loss enforces the embeddings of similar (positive) samples to be close
+      and those of different (negative) samples to be distant.
+  A query embedding is compared with one positive key and with one or more negative keys.
+
+  References:
+      https://arxiv.org/abs/1807.03748v2
+      https://arxiv.org/abs/2010.05113
+  """
+  # Check input dimensionality.
+  if query.shape.ndims != 2:
+    raise ValueError('<query> must have 2 dimensions.')
+  if positive.shape.ndims != 2:
+    raise ValueError('<positive> must have 2 dimensions.')
+  # Embedding vectors should have same number of components.
+  if query.shape[-1] != positive.shape[-1]:
+    raise ValueError(
+        'Vectors of <query> and <positive> should have the same number of components.'
+    )
+
+  # Negative keys are implicitly off-diagonal positive keys.
+
+  # Cosine between all combinations
+  logits = tf.matmul(query, positive, transpose_b=True)
+  logits /= temperature
+
+  # Positive keys are the entries on the diagonal
+  batch_size = tf.shape(query)[0]
+  labels = tf.range(batch_size)
+
+  return tf.losses.sparse_softmax_cross_entropy(labels, logits)
diff --git a/easy_rec/python/loss/jrc_loss.py b/easy_rec/python/loss/jrc_loss.py
index fc77bda86..778068e7e 100644
--- a/easy_rec/python/loss/jrc_loss.py
+++ b/easy_rec/python/loss/jrc_loss.py
@@ -12,8 +12,9 @@ def jrc_loss(labels,
              logits,
              session_ids,
              alpha=0.5,
-             auto_weight=False,
+             loss_weight_strategy='fixed',
              sample_weights=1.0,
+             same_label_loss=True,
              name=''):
   """Joint Optimization of Ranking and Calibration with Contextualized Hybrid Model.
 
@@ -24,14 +25,15 @@ def jrc_loss(labels,
     logits: a `Tensor` with shape [batch_size, 2]. e.g. the value of last neuron before activation.
     session_ids: a `Tensor` with shape [batch_size]. Session ids of each sample, used to max GAUC metric. e.g. user_id
     alpha: the weight to balance ranking loss and calibration loss
-    auto_weight: bool, whether to learn loss weight between ranking loss and calibration loss
+    loss_weight_strategy: str, the loss weight strategy to balancing between ce_loss and ge_loss
     sample_weights: Coefficients for the loss. This must be scalar or broadcastable to
       `labels` (i.e. same rank and each dimension is either 1 or the same).
+    same_label_loss: enable ge_loss for sample with same label in a session or not.
     name: the name of loss
   """
   loss_name = name if name else 'jrc_loss'
-  logging.info('[{}] alpha: {}, auto_weight: {}'.format(loss_name, alpha,
-                                                        auto_weight))
+  logging.info('[{}] alpha: {}, loss_weight_strategy: {}'.format(
+      loss_name, alpha, loss_weight_strategy))
 
   ce_loss = tf.losses.sparse_softmax_cross_entropy(
       labels, logits, weights=sample_weights)
@@ -66,12 +68,48 @@ def jrc_loss(labels,
     y_neg *= pairwise_weights
 
   # Compute list-wise generative loss -log p(x|y, z)
-  loss_pos = -tf.reduce_sum(y_pos * tf.nn.log_softmax(l_pos, axis=0), axis=0)
-  loss_neg = -tf.reduce_sum(y_neg * tf.nn.log_softmax(l_neg, axis=0), axis=0)
-  ge_loss = tf.reduce_mean((loss_pos + loss_neg) / tf.reduce_sum(mask, axis=0))
+  if same_label_loss:
+    logging.info('[%s] enable same_label_loss' % loss_name)
+    loss_pos = -tf.reduce_sum(y_pos * tf.nn.log_softmax(l_pos, axis=0), axis=0)
+    loss_neg = -tf.reduce_sum(y_neg * tf.nn.log_softmax(l_neg, axis=0), axis=0)
+    ge_loss = tf.reduce_mean(
+        (loss_pos + loss_neg) / tf.reduce_sum(mask, axis=0))
+  else:
+    logging.info('[%s] disable same_label_loss' % loss_name)
+    diag = tf.one_hot(tf.range(batch_size), batch_size)
+    l_pos = l_pos + (1 - diag) * y_pos * -1e9
+    l_neg = l_neg + (1 - diag) * y_neg * -1e9
+    loss_pos = -tf.linalg.diag_part(y_pos * tf.nn.log_softmax(l_pos, axis=0))
+    loss_neg = -tf.linalg.diag_part(y_neg * tf.nn.log_softmax(l_neg, axis=0))
+    ge_loss = tf.reduce_mean(loss_pos + loss_neg)
+
+  tf.summary.scalar('loss/%s_ce' % loss_name, ce_loss)
+  tf.summary.scalar('loss/%s_ge' % loss_name, ge_loss)
 
   # The final JRC model
-  if auto_weight:
+  if loss_weight_strategy == 'fixed':
+    loss = alpha * ce_loss + (1 - alpha) * ge_loss
+  elif loss_weight_strategy == 'random_uniform':
+    weight = tf.random_uniform([])
+    loss = weight * ce_loss + (1 - weight) * ge_loss
+    tf.summary.scalar('loss/%s_ce_weight' % loss_name, weight)
+    tf.summary.scalar('loss/%s_ge_weight' % loss_name, 1 - weight)
+  elif loss_weight_strategy == 'random_normal':
+    weights = tf.random_normal([2])
+    loss_weight = tf.nn.softmax(weights)
+    loss = loss_weight[0] * ce_loss + loss_weight[1] * ge_loss
+    tf.summary.scalar('loss/%s_ce_weight' % loss_name, loss_weight[0])
+    tf.summary.scalar('loss/%s_ge_weight' % loss_name, loss_weight[1])
+  elif loss_weight_strategy == 'random_bernoulli':
+    bern = tf.distributions.Bernoulli(probs=0.5, dtype=tf.float32)
+    weights = bern.sample(2)
+    loss_weight = tf.cond(
+        tf.equal(tf.reduce_sum(weights), 1), lambda: weights,
+        lambda: tf.convert_to_tensor([0.5, 0.5]))
+    loss = loss_weight[0] * ce_loss + loss_weight[1] * ge_loss
+    tf.summary.scalar('loss/%s_ce_weight' % loss_name, loss_weight[0])
+    tf.summary.scalar('loss/%s_ge_weight' % loss_name, loss_weight[1])
+  elif loss_weight_strategy == 'uncertainty':
     uncertainty1 = tf.Variable(
         0, name='%s_ranking_loss_weight' % loss_name, dtype=tf.float32)
     tf.summary.scalar('loss/%s_ranking_uncertainty' % loss_name, uncertainty1)
@@ -82,5 +120,6 @@ def jrc_loss(labels,
     loss = tf.exp(-uncertainty1) * ce_loss + 0.5 * uncertainty1
     loss += tf.exp(-uncertainty2) * ge_loss + 0.5 * uncertainty2
   else:
-    loss = alpha * ce_loss + (1 - alpha) * ge_loss
+    raise ValueError('Unsupported loss weight strategy `%s` for jrc loss' %
+                     loss_weight_strategy)
   return loss
diff --git a/easy_rec/python/model/dbmtl.py b/easy_rec/python/model/dbmtl.py
index e829ba57f..a1ebbf14b 100644
--- a/easy_rec/python/model/dbmtl.py
+++ b/easy_rec/python/model/dbmtl.py
@@ -6,8 +6,6 @@
 from easy_rec.python.layers import dnn
 from easy_rec.python.layers import mmoe
 from easy_rec.python.layers import uniter
-from easy_rec.python.layers import fibinet
-from easy_rec.python.layers import mask_net
 from easy_rec.python.model.multi_task_model import MultiTaskModel
 from easy_rec.python.protos.dbmtl_pb2 import DBMTL as DBMTLConfig
 
@@ -39,54 +37,56 @@ def __init__(self,
                                          features,
                                          self._model_config.bottom_uniter,
                                          self._input_layer)
-    elif self._model_config.HasField('bottom_fibinet'):
-      self._fibinet_layer = fibinet.FiBiNetLayer(self._model_config.bottom_fibinet,
-                                                 features,
-                                                 self._input_layer)
-    elif self._model_config.HasField('bottom_mask_net'):
-      self._mask_net_layer = mask_net.MaskNet(self._model_config.bottom_mask_net)
-      self._features, _ = self._input_layer(self._feature_dict, 'all')
-    else:
-      self._features, _ = self._input_layer(self._feature_dict, 'all')
+    elif not self.has_backbone:
+      self._features, self._feature_list = self._input_layer(
+          self._feature_dict, 'all')
     self._init_towers(self._model_config.task_towers)
 
   def build_predict_graph(self):
-    if self._model_config.use_input_batch_norm:
-      self._features = tf.layers.batch_normalization(
-          self._features,
-          training=self._is_training,
-          trainable=True,
-          name='input_bn')
-    if self._model_config.HasField('input_dropout_rate'):
-      drop_rate = self._model_config.input_dropout_rate
-      self._features = tf.layers.dropout(
-          self._features,
-          rate=drop_rate,
-          training=self._is_training,
-          name='input_dropout')
-
-    if self._model_config.HasField('bottom_cmbf'):
-      bottom_fea = self._cmbf_layer(self._is_training, l2_reg=self._l2_reg)
-    elif self._model_config.HasField('bottom_uniter'):
-      bottom_fea = self._uniter_layer(self._is_training, l2_reg=self._l2_reg)
-    elif self._model_config.HasField('bottom_fibinet'):
-      bottom_fea = self._fibinet_layer('all', self._is_training, l2_reg=self._l2_reg)
-    elif self._model_config.HasField('bottom_mask_net'):
-      bottom_fea = self._mask_net_layer(self._features, self._is_training, l2_reg=self._l2_reg)
-    elif self._model_config.HasField('bottom_dnn'):
-      bottom_dnn = dnn.DNN(
-          self._model_config.bottom_dnn,
-          self._l2_reg,
-          name='bottom_dnn',
-          is_training=self._is_training)
-      bottom_fea = bottom_dnn(self._features)
-    else:
-      bottom_fea = self._features
+    # if self._model_config.use_self_supervised_learning:
+    #   bern = tf.distributions.Bernoulli(probs=0.5)
+    #   num_features = len(self._feature_list)
+    #   mask = bern.sample(num_features)
+    #   left_features, right_features = [], []
+    #   for i in range(num_features):
+    #     fea = self._feature_list[i]
+    #     zero = tf.zeros_like(fea)
+    #     left, right = tf.cond(
+    #         tf.equal(mask[i], 1), lambda: (fea, zero), lambda: (zero, fea))
+    #     left_features.append(left)
+    #     right_features.append(right)
+    #   left_feature = tf.concat(left_features, axis=-1)
+    #   right_feature = tf.concat(right_features, axis=-1)
+    #   if self._model_config.HasField('bottom_mask_net'):
+    #     left_encoding = self._mask_net_layer(
+    #         left_feature, self._is_training, l2_reg=self._l2_reg)
+    #     right_encoding = self._mask_net_layer(
+    #         right_feature, self._is_training, l2_reg=self._l2_reg)
+    #   else:
+    #     raise ValueError(
+    #         'Unsupported bottom layer when use self supervised learning')
+    #
+    #   loss = info_nce_loss(
+    #       left_encoding,
+    #       right_encoding,
+    #       temperature=self._model_config.ssl_loss_temperature)
+    #   self._loss_dict['ssl_loss'] = loss * self._model_config.ssl_loss_weight
 
-    if self._model_config.use_sequence_encoder:
-      seq_encoding = self.get_sequence_encoding(is_training=self._is_training)
-      if seq_encoding is not None:
-        bottom_fea = tf.concat([bottom_fea, seq_encoding], axis=-1)
+    bottom_fea = self.backbone
+    if bottom_fea is None:
+      if self._model_config.HasField('bottom_cmbf'):
+        bottom_fea = self._cmbf_layer(self._is_training, l2_reg=self._l2_reg)
+      elif self._model_config.HasField('bottom_uniter'):
+        bottom_fea = self._uniter_layer(self._is_training, l2_reg=self._l2_reg)
+      elif self._model_config.HasField('bottom_dnn'):
+        bottom_dnn = dnn.DNN(
+            self._model_config.bottom_dnn,
+            self._l2_reg,
+            name='bottom_dnn',
+            is_training=self._is_training)
+        bottom_fea = bottom_dnn(self._features)
+      else:
+        bottom_fea = self._features
 
     # MMOE block
     if self._model_config.HasField('expert_dnn'):
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index 4a7ad6330..c6d864498 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -11,7 +11,7 @@
 from tensorflow.python.ops.variables import PartitionedVariable
 
 from easy_rec.python.compat import regularizers
-from easy_rec.python.layers import dnn
+from easy_rec.python.layers.backbone import Backbone
 from easy_rec.python.layers import input_layer
 from easy_rec.python.layers.sequence_encoder import SequenceEncoder
 from easy_rec.python.utils import constant
@@ -66,6 +66,22 @@ def __init__(self,
                                              model_config.feature_groups,
                                              self._l2_reg)
     self._sequence_encoding_by_group_name = {}
+    if model_config.HasField('backbone'):
+      self._backbone = Backbone(model_config.backbone, self, features,
+                                input_layer=self._input_layer,
+                                l2_reg=self._l2_reg)
+    else:
+      self._backbone = None
+
+  @property
+  def has_backbone(self):
+    return self._base_model_config.HasField('backbone')
+
+  @property
+  def backbone(self):
+    if self._backbone:
+      return self._backbone(self._is_training)
+    return None
 
   @property
   def embedding_regularization(self):
@@ -104,8 +120,7 @@ def build_input_layer(self, model_config, feature_configs):
         kernel_regularizer=self._l2_reg,
         variational_dropout_config=model_config.variational_dropout
         if model_config.HasField('variational_dropout') else None,
-        is_training=self._is_training,
-        do_feature_normalize=model_config.do_feature_normalize)
+        is_training=self._is_training)
 
   def get_sequence_encoding(self, group_name=None, is_training=True):
     if group_name is not None:
@@ -143,13 +158,13 @@ def get_sequence_encoding(self, group_name=None, is_training=True):
     else:
       return None
 
-    if self._base_model_config.HasField('sequence_dnn'):
-      sequence_dnn = dnn.DNN(
-          self._base_model_config.sequence_dnn,
-          self._l2_reg,
-          name='sequence_dnn',
-          is_training=self._is_training)
-      encoding = sequence_dnn(encoding)
+    # if self._base_model_config.HasField('sequence_dnn'):
+    #   sequence_dnn = dnn.DNN(
+    #       self._base_model_config.sequence_dnn,
+    #       self._l2_reg,
+    #       name='sequence_dnn',
+    #       is_training=self._is_training)
+    #   encoding = sequence_dnn(encoding)
     return encoding
 
   @abstractmethod
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
new file mode 100644
index 000000000..7b128afe4
--- /dev/null
+++ b/easy_rec/python/protos/backbone.proto
@@ -0,0 +1,44 @@
+syntax = "proto2";
+package protos;
+
+import "easy_rec/python/protos/dnn.proto";
+import "easy_rec/python/protos/fibinet.proto";
+import "easy_rec/python/protos/masknet.proto";
+
+message NumericalEmbedding {
+    required uint32 embedding_dim = 1;
+    required float  coef_stddev = 2 [default = 1.0];
+}
+
+message SequenceLayer {
+    optional DNN mlp = 1;
+}
+
+message InputLayer {
+    optional bool do_batch_norm = 1;
+    optional bool do_layer_norm = 2;
+    optional float dropout_rate = 3;
+    optional float feature_dropout_rate = 4;
+    optional bool output_feature_list = 5;
+}
+
+message Block {
+    required string name = 1;
+    // the input names of feature groups or other blocks
+    repeated string inputs = 2;
+    oneof layer {
+        InputLayer input_layer = 100;
+        NumericalEmbedding numerical_embedding = 101;
+        SequenceLayer sequence_encoder = 102;
+        MaskNet masknet = 103;
+        SENet senet = 104;
+        FiBiNetTower fibinet = 105;
+        DNN mlp = 106;
+    }
+}
+
+message BackboneTower {
+    repeated Block blocks = 1;
+    repeated string concat_blocks = 2;
+    optional DNN top_mlp = 3;
+}
\ No newline at end of file
diff --git a/easy_rec/python/protos/cmbf.proto b/easy_rec/python/protos/cmbf.proto
index 598bf1ecf..34e082115 100644
--- a/easy_rec/python/protos/cmbf.proto
+++ b/easy_rec/python/protos/cmbf.proto
@@ -1,9 +1,50 @@
 syntax = "proto2";
 package protos;
 
-import "easy_rec/python/protos/layer.proto";
 import "easy_rec/python/protos/dnn.proto";
 
+message CMBFTower {
+    // The number of heads of cross modal fusion layer
+    required uint32 multi_head_num = 1 [default = 1];
+    // The number of heads of image feature learning layer
+    required uint32 image_multi_head_num = 101 [default = 1];
+    // The number of heads of text feature learning layer
+    required uint32 text_multi_head_num = 102 [default = 1];
+    // The dimension of text heads
+    required uint32 text_head_size = 2;
+    // The dimension of image heads
+    required uint32 image_head_size = 3 [default = 64];
+    // The number of patches of image feature, take effect when there is only one image feature
+    required uint32 image_feature_patch_num = 4 [default = 1];
+    // Do dimension reduce to this size for image feature before single modal learning module
+    required uint32 image_feature_dim = 5 [default = 0];
+    // The number of self attention layers for image features
+    required uint32 image_self_attention_layer_num = 6 [default = 0];
+    // The number of self attention layers for text features
+    required uint32 text_self_attention_layer_num = 7 [default = 1];
+    // The number of cross modal layers
+    required uint32 cross_modal_layer_num = 8 [default = 1];
+    // The dimension of image cross modal heads
+    required uint32 image_cross_head_size = 9;
+    // The dimension of text cross modal heads
+    required uint32 text_cross_head_size = 10;
+    // Dropout probability for hidden layers
+    required float hidden_dropout_prob = 11 [default = 0.0];
+    // Dropout probability of the attention probabilities
+    required float attention_probs_dropout_prob = 12 [default = 0.0];
+
+    // Whether to add embeddings for different text sequence features
+    required bool use_token_type = 13 [default = false];
+    // Whether to add position embeddings for the position of each token in the text sequence
+    required bool use_position_embeddings = 14 [default = true];
+    // Maximum sequence length that might ever be used with this model
+    required uint32 max_position_embeddings = 15 [default = 0];
+    // Dropout probability for text sequence embeddings
+    required float text_seq_emb_dropout_prob = 16 [default = 0.1];
+    // dnn layers for other features
+    optional DNN other_feature_dnn = 17;
+}
+
 message CMBF {
     required CMBFTower config = 1;
 
diff --git a/easy_rec/python/protos/dbmtl.proto b/easy_rec/python/protos/dbmtl.proto
index 5c7152ee1..a9c4a2e74 100644
--- a/easy_rec/python/protos/dbmtl.proto
+++ b/easy_rec/python/protos/dbmtl.proto
@@ -3,19 +3,14 @@ package protos;
 
 import "easy_rec/python/protos/dnn.proto";
 import "easy_rec/python/protos/tower.proto";
-import "easy_rec/python/protos/layer.proto";
-import "easy_rec/python/protos/fibinet.proto";
-import "easy_rec/python/protos/masknet.proto";
+import "easy_rec/python/protos/cmbf.proto";
+import "easy_rec/python/protos/uniter.proto";
 
 message DBMTL {
     // shared bottom cmbf layer
     optional CMBFTower bottom_cmbf = 101;
     // shared bottom uniter layer
     optional UniterTower bottom_uniter = 102;
-    // shared bottom fibinet layer
-    optional FiBiNetTower bottom_fibinet = 103;
-    // shared bottom mask net layer
-    optional MaskNet bottom_mask_net = 104;
     // shared bottom dnn layer
     optional DNN bottom_dnn = 1;
     // mmoe expert dnn layer definition
@@ -26,10 +21,9 @@ message DBMTL {
     repeated BayesTaskTower task_towers = 4;
     // l2 regularization
     optional float l2_regularization = 5 [default = 1e-4];
-    // Whether to user sequence encoder
-    required bool use_sequence_encoder = 6 [default = false];
-    // Whether to user sequence encoder
-    required bool use_input_batch_norm = 7 [default = false];
-    // input layer dropout rate
-    optional float input_dropout_rate = 8 [default = 0];
+
+    // Whether to use self supervised learning
+    required bool use_self_supervised_learning = 9 [default = false];
+    optional float ssl_loss_weight = 10 [default = 1.0];
+    optional float ssl_loss_temperature = 11 [default = 0.1];
 }
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index f28180e10..faa78a0bf 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -1,9 +1,9 @@
 syntax = "proto2";
 package protos;
 
+import "easy_rec/python/protos/backbone.proto";
 import "easy_rec/python/protos/fm.proto";
 import "easy_rec/python/protos/deepfm.proto";
-import "easy_rec/python/protos/dnn.proto";
 import "easy_rec/python/protos/wide_and_deep.proto";
 import "easy_rec/python/protos/multi_tower.proto";
 import "easy_rec/python/protos/dlrm.proto";
@@ -110,8 +110,7 @@ message EasyRecModel {
     }
     required LossWeightStrategy loss_weight_strategy = 16 [default = Fixed];
 
-    // dnn layers after sequence feature
-    optional DNN sequence_dnn = 17;
+    optional BackboneTower backbone = 17;
 
-    optional bool do_feature_normalize = 18;
+    // optional bool do_feature_normalize = 18;
 }
diff --git a/easy_rec/python/protos/fibinet.proto b/easy_rec/python/protos/fibinet.proto
index b13fd7cba..124bebfe4 100644
--- a/easy_rec/python/protos/fibinet.proto
+++ b/easy_rec/python/protos/fibinet.proto
@@ -3,13 +3,21 @@ package protos;
 
 import "easy_rec/python/protos/dnn.proto";
 
-message FiBiNetTower {
-    required string bilinear_type = 1 [default = 'interaction'];
-    required bool use_bilinear_plus = 2 [default = true];
-    required uint32 bilinear_output_units = 3;
+message SENet {
+    required uint32 reduction_ratio = 1 [default = 4];
+    optional uint32 num_squeeze_group = 2 [default = 2];
+    optional bool use_skip_connection = 3 [default = true];
+    optional bool use_output_layer_norm = 4 [default = true];
+}
 
-    required uint32 senet_reduction_ratio = 4 [default = 3];
-    optional uint32 num_senet_squeeze_group = 5 [default = 2];
+message Bilinear {
+    required string type = 1 [default = 'interaction'];
+    required bool use_plus = 2 [default = true];
+    required uint32 output_units = 3;
+}
 
-    required DNN mlp = 6;
+message FiBiNetTower {
+    optional Bilinear bilinear = 1;
+    required SENet senet = 2;
+    optional DNN mlp = 8;
 }
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index 4ddacac5e..b2ac0d789 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -8,72 +8,7 @@ message HighWayTower {
     required uint32 emb_size = 2;
 }
 
-message CMBFTower {
-    // The number of heads of cross modal fusion layer
-    required uint32 multi_head_num = 1 [default = 1];
-    // The number of heads of image feature learning layer
-    required uint32 image_multi_head_num = 101 [default = 1];
-    // The number of heads of text feature learning layer
-    required uint32 text_multi_head_num = 102 [default = 1];
-    // The dimension of text heads
-    required uint32 text_head_size = 2;
-    // The dimension of image heads
-    required uint32 image_head_size = 3 [default = 64];
-    // The number of patches of image feature, take effect when there is only one image feature
-    required uint32 image_feature_patch_num = 4 [default = 1];
-    // Do dimension reduce to this size for image feature before single modal learning module
-    required uint32 image_feature_dim = 5 [default = 0];
-    // The number of self attention layers for image features
-    required uint32 image_self_attention_layer_num = 6 [default = 0];
-    // The number of self attention layers for text features
-    required uint32 text_self_attention_layer_num = 7 [default = 1];
-    // The number of cross modal layers
-    required uint32 cross_modal_layer_num = 8 [default = 1];
-    // The dimension of image cross modal heads
-    required uint32 image_cross_head_size = 9;
-    // The dimension of text cross modal heads
-    required uint32 text_cross_head_size = 10;
-    // Dropout probability for hidden layers
-    required float hidden_dropout_prob = 11 [default = 0.0];
-    // Dropout probability of the attention probabilities
-    required float attention_probs_dropout_prob = 12 [default = 0.0];
 
-    // Whether to add embeddings for different text sequence features
-    required bool use_token_type = 13 [default = false];
-    // Whether to add position embeddings for the position of each token in the text sequence
-    required bool use_position_embeddings = 14 [default = true];
-    // Maximum sequence length that might ever be used with this model
-    required uint32 max_position_embeddings = 15 [default = 0];
-    // Dropout probability for text sequence embeddings
-    required float text_seq_emb_dropout_prob = 16 [default = 0.1];
-    // dnn layers for other features
-    optional DNN other_feature_dnn = 17;
-}
-
-message UniterTower {
-    // Size of the encoder layers and the pooler layer
-    required uint32 hidden_size = 1;
-    // Number of hidden layers in the Transformer encoder
-    required uint32 num_hidden_layers = 2;
-    // Number of attention heads for each attention layer in the Transformer encoder
-    required uint32 num_attention_heads = 3;
-    // The size of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
-    required uint32 intermediate_size = 4;
-    // The non-linear activation function (function or string) in the encoder and pooler.
-    required string hidden_act = 5 [default = 'gelu'];  // "gelu", "relu", "tanh" and "swish" are supported.
-    // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler
-    required float hidden_dropout_prob = 6 [default = 0.1];
-    // The dropout ratio for the attention probabilities
-    required float attention_probs_dropout_prob = 7 [default = 0.1];
-    // The maximum sequence length that this model might ever be used with
-    required uint32 max_position_embeddings = 8 [default = 512];
-    // Whether to add position embeddings for the position of each token in the text sequence
-    required bool use_position_embeddings = 9 [default = true];
-    // The stddev of the truncated_normal_initializer for initializing all weight matrices
-    required float initializer_range = 10 [default = 0.02];
-    // dnn layers for other features
-    optional DNN other_feature_dnn = 11;
-}
 
 message SequenceEncoder {
     // encoder parameters
diff --git a/easy_rec/python/protos/loss.proto b/easy_rec/python/protos/loss.proto
index c5b74f47d..5c913bf6e 100644
--- a/easy_rec/python/protos/loss.proto
+++ b/easy_rec/python/protos/loss.proto
@@ -93,4 +93,6 @@ message PairwiseLogisticLoss {
 message JRCLoss {
   required string session_name = 1;
   optional float alpha = 2 [default = 0.5];
+  optional bool same_label_loss = 3 [default = true];
+  required string loss_weight_strategy = 4 [default = 'fixed'];
 }
diff --git a/easy_rec/python/protos/masknet.proto b/easy_rec/python/protos/masknet.proto
index c9b0b703a..3feba334e 100644
--- a/easy_rec/python/protos/masknet.proto
+++ b/easy_rec/python/protos/masknet.proto
@@ -14,4 +14,4 @@ message MaskNet {
     repeated MaskBlock mask_blocks = 1;
     required bool use_parallel = 2 [default = true];
     optional DNN mlp = 3;
-}
\ No newline at end of file
+}
diff --git a/easy_rec/python/protos/uniter.proto b/easy_rec/python/protos/uniter.proto
index 7e78ad23e..9efc1dc9e 100644
--- a/easy_rec/python/protos/uniter.proto
+++ b/easy_rec/python/protos/uniter.proto
@@ -1,9 +1,33 @@
 syntax = "proto2";
 package protos;
 
-import "easy_rec/python/protos/layer.proto";
 import "easy_rec/python/protos/dnn.proto";
 
+message UniterTower {
+    // Size of the encoder layers and the pooler layer
+    required uint32 hidden_size = 1;
+    // Number of hidden layers in the Transformer encoder
+    required uint32 num_hidden_layers = 2;
+    // Number of attention heads for each attention layer in the Transformer encoder
+    required uint32 num_attention_heads = 3;
+    // The size of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
+    required uint32 intermediate_size = 4;
+    // The non-linear activation function (function or string) in the encoder and pooler.
+    required string hidden_act = 5 [default = 'gelu'];  // "gelu", "relu", "tanh" and "swish" are supported.
+    // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler
+    required float hidden_dropout_prob = 6 [default = 0.1];
+    // The dropout ratio for the attention probabilities
+    required float attention_probs_dropout_prob = 7 [default = 0.1];
+    // The maximum sequence length that this model might ever be used with
+    required uint32 max_position_embeddings = 8 [default = 512];
+    // Whether to add position embeddings for the position of each token in the text sequence
+    required bool use_position_embeddings = 9 [default = true];
+    // The stddev of the truncated_normal_initializer for initializing all weight matrices
+    required float initializer_range = 10 [default = 0.02];
+    // dnn layers for other features
+    optional DNN other_feature_dnn = 11;
+}
+
 message Uniter {
     required UniterTower config = 1;
 
diff --git a/easy_rec/python/utils/dag.py b/easy_rec/python/utils/dag.py
new file mode 100644
index 000000000..5063c8473
--- /dev/null
+++ b/easy_rec/python/utils/dag.py
@@ -0,0 +1,205 @@
+from collections import OrderedDict, defaultdict
+from copy import copy, deepcopy
+
+
+class DAG(object):
+    """ Directed acyclic graph implementation. """
+
+    def __init__(self):
+        """ Construct a new DAG with no nodes or edges. """
+        self.reset_graph()
+
+    def add_node(self, node_name, graph=None):
+        """ Add a node if it does not exist yet, or error out. """
+        if not graph:
+            graph = self.graph
+        if node_name in graph:
+            raise KeyError('node %s already exists' % node_name)
+        graph[node_name] = set()
+
+    def add_node_if_not_exists(self, node_name, graph=None):
+        try:
+            self.add_node(node_name, graph=graph)
+        except KeyError:
+            pass
+
+    def delete_node(self, node_name, graph=None):
+        """ Deletes this node and all edges referencing it. """
+        if not graph:
+            graph = self.graph
+        if node_name not in graph:
+            raise KeyError('node %s does not exist' % node_name)
+        graph.pop(node_name)
+
+        for node, edges in graph.items():
+            if node_name in edges:
+                edges.remove(node_name)
+
+    def delete_node_if_exists(self, node_name, graph=None):
+        try:
+            self.delete_node(node_name, graph=graph)
+        except KeyError:
+            pass
+
+    def add_edge(self, ind_node, dep_node, graph=None):
+        """ Add an edge (dependency) between the specified nodes. """
+        if not graph:
+            graph = self.graph
+        if ind_node not in graph or dep_node not in graph:
+            raise KeyError('one or more nodes do not exist in graph')
+        test_graph = deepcopy(graph)
+        test_graph[ind_node].add(dep_node)
+        is_valid, message = self.validate(test_graph)
+        if is_valid:
+            graph[ind_node].add(dep_node)
+        else:
+            raise Exception()
+
+    def delete_edge(self, ind_node, dep_node, graph=None):
+        """ Delete an edge from the graph. """
+        if not graph:
+            graph = self.graph
+        if dep_node not in graph.get(ind_node, []):
+            raise KeyError('this edge does not exist in graph')
+        graph[ind_node].remove(dep_node)
+
+    def rename_edges(self, old_task_name, new_task_name, graph=None):
+        """ Change references to a task in existing edges. """
+        if not graph:
+            graph = self.graph
+        for node, edges in graph.items():
+
+            if node == old_task_name:
+                graph[new_task_name] = copy(edges)
+                del graph[old_task_name]
+
+            else:
+                if old_task_name in edges:
+                    edges.remove(old_task_name)
+                    edges.add(new_task_name)
+
+    def predecessors(self, node, graph=None):
+        """ Returns a list of all predecessors of the given node """
+        if graph is None:
+            graph = self.graph
+        return [key for key in graph if node in graph[key]]
+
+    def downstream(self, node, graph=None):
+        """ Returns a list of all nodes this node has edges towards. """
+        if graph is None:
+            graph = self.graph
+        if node not in graph:
+            raise KeyError('node %s is not in graph' % node)
+        return list(graph[node])
+
+    def all_downstreams(self, node, graph=None):
+        """Returns a list of all nodes ultimately downstream
+        of the given node in the dependency graph, in
+        topological order."""
+        if graph is None:
+            graph = self.graph
+        nodes = [node]
+        nodes_seen = set()
+        i = 0
+        while i < len(nodes):
+            downstreams = self.downstream(nodes[i], graph)
+            for downstream_node in downstreams:
+                if downstream_node not in nodes_seen:
+                    nodes_seen.add(downstream_node)
+                    nodes.append(downstream_node)
+            i += 1
+        return list(
+            filter(
+                lambda node: node in nodes_seen,
+                self.topological_sort(graph=graph)
+            )
+        )
+
+    def all_leaves(self, graph=None):
+        """ Return a list of all leaves (nodes with no downstreams) """
+        if graph is None:
+            graph = self.graph
+        return [key for key in graph if not graph[key]]
+
+    def from_dict(self, graph_dict):
+        """ Reset the graph and build it from the passed dictionary.
+        The dictionary takes the form of {node_name: [directed edges]}
+        """
+
+        self.reset_graph()
+        for new_node in graph_dict.keys():
+            self.add_node(new_node)
+        for ind_node, dep_nodes in graph_dict.items():
+            if not isinstance(dep_nodes, list):
+                raise TypeError('dict values must be lists')
+            for dep_node in dep_nodes:
+                self.add_edge(ind_node, dep_node)
+
+    def reset_graph(self):
+        """ Restore the graph to an empty state. """
+        self.graph = OrderedDict()
+
+    def ind_nodes(self, graph=None):
+        """ Returns a list of all nodes in the graph with no dependencies. """
+        if graph is None:
+            graph = self.graph
+
+        dependent_nodes = set(
+            node for dependents in graph.values() for node in dependents
+        )
+        return [node for node in graph.keys() if node not in dependent_nodes]
+
+    def validate(self, graph=None):
+        """ Returns (Boolean, message) of whether DAG is valid. """
+        graph = graph if graph is not None else self.graph
+        if len(self.ind_nodes(graph)) == 0:
+            return False, 'no independent nodes detected'
+        try:
+            self.topological_sort(graph)
+        except ValueError:
+            return False, 'failed topological sort'
+        return True, 'valid'
+
+    def topological_sort(self, graph=None):
+        """ Returns a topological ordering of the DAG.
+        Raises an error if this is not possible (graph is not valid).
+        """
+        if graph is None:
+            graph = self.graph
+        result = []
+        in_degree = defaultdict(lambda: 0)
+
+        for u in graph:
+            for v in graph[u]:
+                in_degree[v] += 1
+        ready = [node for node in graph if not in_degree[node]]
+
+        while ready:
+            u = ready.pop()
+            result.append(u)
+            for v in graph[u]:
+                in_degree[v] -= 1
+                if in_degree[v] == 0:
+                    ready.append(v)
+
+        if len(result) == len(graph):
+            return result
+        else:
+            raise ValueError('graph is not acyclic')
+
+    def size(self):
+        return len(self.graph)
+
+
+if __name__ == '__main__':
+    dag = DAG()
+    dag.add_node("a")
+    dag.add_node("b")
+    dag.add_node("c")
+    dag.add_node("d")
+    dag.add_edge("a", "b")
+    dag.add_edge("a", "d")
+    dag.add_edge("b", "c")
+    print(dag.topological_sort())
+    print(dag.graph)
+    print(dag.all_downstreams("b"))
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index f99acc17b..cd2b0ac0c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ multi_line_output = 7
 force_single_line = true
 known_standard_library = setuptools
 known_first_party = easy_rec
-known_third_party = absl,common_io,distutils,docutils,eas_prediction,easyrec_request,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
+known_third_party = absl,common_io,docutils,eas_prediction,future,google,graphlearn,kafka,matplotlib,numpy,oss2,pai,pandas,psutil,six,skimage,sklearn,sphinx_markdown_tables,sphinx_rtd_theme,tensorflow,yaml
 no_lines_before = LOCALFOLDER
 default_section = THIRDPARTY
 skip = easy_rec/python/protos

From 5a47eb822c523dfcfb1c6e4d40cb9b1e2564b914 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 12 Jun 2023 14:24:00 +0800
Subject: [PATCH 29/54] [feat]: add backbone network

---
 easy_rec/python/layers/backbone.py            | 27 ++++++--
 easy_rec/python/layers/common_layers.py       |  5 +-
 easy_rec/python/layers/mask_net.py            | 12 ++--
 easy_rec/python/layers/numerical_embedding.py | 49 ++++++++++++--
 easy_rec/python/model/rank_model.py           |  9 +++
 easy_rec/python/protos/backbone.proto         | 29 +++-----
 easy_rec/python/protos/easy_rec_model.proto   |  3 +-
 easy_rec/python/protos/feature_config.proto   |  2 +-
 easy_rec/python/protos/layer.proto            | 67 ++++++-------------
 easy_rec/python/protos/seq_encoder.proto      | 53 +++++++++++++++
 10 files changed, 172 insertions(+), 84 deletions(-)
 create mode 100644 easy_rec/python/protos/seq_encoder.proto

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 285ff80c5..3e95ba709 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -6,8 +6,8 @@
 
 from easy_rec.python.utils.dag import DAG
 from easy_rec.python.layers import dnn
-from easy_rec.python.layers.common_layers import layer_norm, SENet
-from easy_rec.python.layers.numerical_embedding import NumericalEmbedding
+from easy_rec.python.layers.common_layers import layer_norm, SENet, highway
+from easy_rec.python.layers.numerical_embedding import PeriodicEmbedding, AutoDisEmbedding
 from easy_rec.python.layers.fibinet import FiBiNetLayer
 from easy_rec.python.layers.mask_net import MaskNet
 
@@ -97,6 +97,7 @@ def __call__(self, is_training, *args, **kwargs):
     block_outputs = {}
     blocks = self._dag.topological_sort()
     logging.info("backbone topological: " + ','.join(blocks))
+    print("backbone topological: " + ','.join(blocks))
     for block in blocks:
       config = self._name_to_blocks[block]
       layer = config.WhichOneof('layer')
@@ -106,12 +107,26 @@ def __call__(self, is_training, *args, **kwargs):
         input_layer = EnhancedInputLayer(conf, self._input_layer, self._features)
         output = input_layer(config.inputs[0], is_training)
         block_outputs[block] = output
-      elif layer == 'numerical_embedding':
-        conf = config.numerical_embedding
-        num_emb = NumericalEmbedding(conf.embedding_dim, stddev=conf.coef_stddev,
-                                     scope='%s_numerical_embedding' % block)
+      elif layer == 'periodic_embedding':
+        conf = config.periodic_embedding
+        num_emb = PeriodicEmbedding(conf.embedding_dim, stddev=conf.coef_stddev, scope=block)
         input_feature = self.block_input(config, block_outputs)
         block_outputs[block] = num_emb(input_feature)
+      elif layer == 'auto_dis_embedding':
+        conf = config.auto_dis_embedding
+        num_emb = AutoDisEmbedding(conf, scope=block)
+        input_feature = self.block_input(config, block_outputs)
+        block_outputs[block] = num_emb(input_feature)
+      elif layer == 'highway':
+        conf = config.highway
+        input_feature = self.block_input(config, block_outputs)
+        highway_fea = highway(
+          input_feature,
+          conf.emb_size,
+          activation=conf.activation,
+          dropout=conf.dropout_rate,
+          scope=block)
+        block_outputs[block] = highway_fea(input_feature)
       elif layer == 'mlp':
         mlp = dnn.DNN(
           config.mlp,
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index 892e75550..be4615699 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -2,10 +2,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import itertools
 import logging
-
+import six
 import tensorflow as tf
 
 from easy_rec.python.compat.layers import layer_norm as tf_layer_norm
+from easy_rec.python.utils.activation import get_activation
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -18,6 +19,8 @@ def highway(x,
             scope='highway',
             dropout=0.0,
             reuse=None):
+  if isinstance(activation, six.string_types):
+    activation = get_activation(activation)
   with tf.variable_scope(scope, reuse):
     if size is None:
       size = x.shape.as_list()[-1]
diff --git a/easy_rec/python/layers/mask_net.py b/easy_rec/python/layers/mask_net.py
index fbe75c13c..034cd6018 100644
--- a/easy_rec/python/layers/mask_net.py
+++ b/easy_rec/python/layers/mask_net.py
@@ -45,15 +45,19 @@ def __call__(self, net, mask_input):
     masked_net = net * mask
 
     output_size = self.mask_block_config.output_size
-    hidden_layer_output = tf.layers.dense(
-        masked_net, output_size, name='%s/output' % self.name, reuse=self.reuse)
-    return layer_norm(
-        hidden_layer_output, name='%s/ln_output' % self.name, reuse=self.reuse)
+    hidden = tf.layers.dense(
+        masked_net, output_size, use_bias=False, name='%s/output' % self.name, reuse=self.reuse)
+    ln_hidden = layer_norm(hidden, name='%s/ln_output' % self.name, reuse=self.reuse)
+    return tf.nn.relu(ln_hidden)
 
 
 class MaskNet(object):
 
   def __init__(self, mask_net_config, name='mask_net', reuse=None):
+    """MaskNet: Introducing Feature-Wise Multiplication to CTR Ranking Models by Instance-Guided Mask.
+
+    Refer: https://arxiv.org/pdf/2102.07619.pdf
+    """
     self.mask_net_config = mask_net_config
     self.name = name
     self.reuse = reuse
diff --git a/easy_rec/python/layers/numerical_embedding.py b/easy_rec/python/layers/numerical_embedding.py
index 420716254..26e9f63a3 100644
--- a/easy_rec/python/layers/numerical_embedding.py
+++ b/easy_rec/python/layers/numerical_embedding.py
@@ -8,16 +8,20 @@
   tf = tf.compat.v1
 
 
-class NumericalEmbedding(object):
+class PeriodicEmbedding(object):
 
-  def __init__(self, embedding_dim, scope='numerical_embedding', stddev=1.0):
-    self.embedding_dim = embedding_dim
+  def __init__(self, embedding_dim, scope='periodic_embedding', stddev=1.0):
+    """On Embeddings for Numerical Features in Tabular Deep Learning.
+
+    Refer: https://arxiv.org/pdf/2203.05556.pdf
+    """
+    self.embedding_dim = embedding_dim // 2
     self.scope = scope
     self.initializer = tf.random_normal_initializer(stddev=stddev)
 
   def __call__(self, inputs, *args, **kwargs):
     if inputs.shape.ndims != 2:
-      raise ValueError('inputs of NumericalEmbedding must have 2 dimensions.')
+      raise ValueError('inputs of PeriodicEmbedding must have 2 dimensions.')
 
     num_features = int(inputs.shape[-1])
     with tf.variable_scope(self.scope):
@@ -37,3 +41,40 @@ def __call__(self, inputs, *args, **kwargs):
         embedding = tf.layers.dense(embedding, int(embedding.shape[-1]), activation=tf.nn.relu)
         embeddings.append(embedding)
       return tf.concat(embeddings, axis=1)
+
+
+class AutoDisEmbedding(object):
+  def __init__(self, config, scope='auto_dis'):
+    """An Embedding Learning Framework for Numerical Features in CTR Prediction.
+
+    Refer: https://arxiv.org/pdf/2012.08986v2.pdf
+    """
+    self.config = config
+    self.emb_dim = config.embedding_dim
+    self.num_bins = config.num_bins
+    self.scope = scope
+
+  def __call__(self, inputs, *args, **kwargs):
+    if inputs.shape.ndims != 2:
+      raise ValueError('inputs of PeriodicEmbedding must have 2 dimensions.')
+
+    num_features = int(inputs.shape[-1])
+    with tf.variable_scope(self.scope):
+      meta_emb = tf.get_variable('meta_embedding', shape=[1, num_features, self.num_bins, self.emb_dim])
+      w = tf.get_variable('project_w', shape=[1, num_features, self.num_bins])
+      mat = tf.get_variable('project_mat', shape=[1, num_features, self.num_bins, self.num_bins])
+
+      x = tf.expand_dims(inputs, axis=-1)  # [B, num_fea, 1]
+      hidden = tf.nn.leaky_relu(w * x)  # [B, num_fea, num_bin]
+
+      y = tf.matmul(mat, tf.expand_dims(hidden, axis=-1))  # [B, num_fea, num_bin, 1]
+      y = tf.squeeze(y, axis=3)  # [B, num_fea, num_bin]
+
+      # keep_prob(float): if dropout_flag is True, keep_prob rate to keep connect; (float, keep_prob=0.8)
+      alpha = self.config.keep_prob
+      x_bar = y + alpha * hidden  # [B, num_fea, num_bin]
+      x_hat = tf.nn.softmax(x_bar / self.config.temperature)  # [B, num_fea, num_bin]
+
+      emb = tf.matmul(tf.expand_dims(x_hat, axis=2), meta_emb)  # [B, num_fea, 1, emb_dim]
+      # emb = tf.squeeze(emb, axis=2)  # [B, num_fea, emb_dim]
+      return tf.reshape(emb, [-1, self.emb_dim * num_features])  # [B, num_fea*emb_dim]
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index 4f4368b9f..2b4ccfd21 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -29,6 +29,15 @@ def __init__(self,
     if self._labels is not None:
       self._label_name = list(self._labels.keys())[0]
 
+  def build_predict_graph(self):
+    if not self.has_backbone:
+      raise NotImplementedError('method `build_predict_graph` must be implemented when backbone network do not exits')
+
+    net = self.backbone
+    output = tf.layers.dense(net, self._num_class, name='output')
+    self._add_to_prediction_dict(output)
+    return self._prediction_dict
+
   def _output_to_prediction_impl(self,
                                  output,
                                  loss_type,
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
index 7b128afe4..f17b22a10 100644
--- a/easy_rec/python/protos/backbone.proto
+++ b/easy_rec/python/protos/backbone.proto
@@ -2,38 +2,29 @@ syntax = "proto2";
 package protos;
 
 import "easy_rec/python/protos/dnn.proto";
+import "easy_rec/python/protos/layer.proto";
 import "easy_rec/python/protos/fibinet.proto";
 import "easy_rec/python/protos/masknet.proto";
 
-message NumericalEmbedding {
-    required uint32 embedding_dim = 1;
-    required float  coef_stddev = 2 [default = 1.0];
-}
 
 message SequenceLayer {
     optional DNN mlp = 1;
 }
 
-message InputLayer {
-    optional bool do_batch_norm = 1;
-    optional bool do_layer_norm = 2;
-    optional float dropout_rate = 3;
-    optional float feature_dropout_rate = 4;
-    optional bool output_feature_list = 5;
-}
-
 message Block {
     required string name = 1;
     // the input names of feature groups or other blocks
     repeated string inputs = 2;
     oneof layer {
-        InputLayer input_layer = 100;
-        NumericalEmbedding numerical_embedding = 101;
-        SequenceLayer sequence_encoder = 102;
-        MaskNet masknet = 103;
-        SENet senet = 104;
-        FiBiNetTower fibinet = 105;
-        DNN mlp = 106;
+        InputLayer input_layer = 101;
+        DNN mlp = 102;
+        PeriodicEmbedding periodic_embedding = 103;
+        AutoDisEmbedding auto_dis_embedding = 104;
+        SequenceLayer sequence_encoder = 105;
+        HighWayTower highway = 106;
+        MaskNet masknet = 107;
+        SENet senet = 108;
+        FiBiNetTower fibinet = 109;
     }
 }
 
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index faa78a0bf..c6a03c403 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -56,6 +56,7 @@ message EasyRecModel {
 
     // model parameters
     oneof model {
+        RankModel rank_model = 100;
         DummyModel dummy = 101;
         WideAndDeep wide_and_deep = 102;
         DeepFM deepfm = 103;
@@ -111,6 +112,4 @@ message EasyRecModel {
     required LossWeightStrategy loss_weight_strategy = 16 [default = Fixed];
 
     optional BackboneTower backbone = 17;
-
-    // optional bool do_feature_normalize = 18;
 }
diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto
index 17e501361..e8b3b2c4f 100644
--- a/easy_rec/python/protos/feature_config.proto
+++ b/easy_rec/python/protos/feature_config.proto
@@ -3,7 +3,7 @@ package protos;
 
 import "easy_rec/python/protos/hyperparams.proto";
 import "easy_rec/python/protos/dnn.proto";
-import "easy_rec/python/protos/layer.proto";
+import "easy_rec/python/protos/seq_encoder.proto";
 enum WideOrDeep {
     DEEP = 0;
     WIDE = 1;
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index b2ac0d789..5c7bb81a1 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -3,56 +3,29 @@ package protos;
 
 import "easy_rec/python/protos/dnn.proto";
 
-message HighWayTower {
-    required string input = 1;
-    required uint32 emb_size = 2;
+message InputLayer {
+    optional bool do_batch_norm = 1;
+    optional bool do_layer_norm = 2;
+    optional float dropout_rate = 3;
+    optional float feature_dropout_rate = 4;
+    optional bool output_feature_list = 5;
 }
 
-
-
-message SequenceEncoder {
-    // encoder parameters
-    oneof encoder {
-        BSTEncoder bst = 101;
-        DINEncoder din = 102;
-    }
-    required bool force_share_embeddings = 1 [default = true];
+message HighWayTower {
+    optional string input = 1;
+    required uint32 emb_size = 2;
+    required string activation = 3 [default = 'gelu'];
+    optional float dropout_rate = 4;
 }
 
-message BSTEncoder {
-    // Size of the encoder layers and the pooler layer
-    required uint32 hidden_size = 1;
-    // Number of hidden layers in the Transformer encoder
-    required uint32 num_hidden_layers = 2;
-    // Number of attention heads for each attention layer in the Transformer encoder
-    required uint32 num_attention_heads = 3;
-    // The size of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
-    required uint32 intermediate_size = 4;
-    // The non-linear activation function (function or string) in the encoder and pooler.
-    required string hidden_act = 5 [default = 'gelu'];  // "gelu", "relu", "tanh" and "swish" are supported.
-    // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler
-    required float hidden_dropout_prob = 6 [default = 0.1];
-    // The dropout ratio for the attention probabilities
-    required float attention_probs_dropout_prob = 7 [default = 0.1];
-    // The maximum sequence length that this model might ever be used with
-    required uint32 max_position_embeddings = 8 [default = 512];
-    // Whether to add position embeddings for the position of each token in the text sequence
-    required bool use_position_embeddings = 9 [default = true];
-    // The stddev of the truncated_normal_initializer for initializing all weight matrices
-    required float initializer_range = 10 [default = 0.02];
-    // need contrastive learning
-    required bool need_contrastive_learning = 11 [default = false];
-    // the weight of contrastive learning loss
-    optional float contrastive_loss_weight = 12 [default = 1.0];
-    // whether need auto learn contrastive loss weight
-    optional bool auto_contrastive_loss_weight = 13 [default = false];
+message PeriodicEmbedding {
+    required uint32 embedding_dim = 1;
+    required float  coef_stddev = 2 [default = 1.0];
 }
 
-message DINEncoder {
-    // din attention layer
-    required DNN attention_dnn = 1;
-    // whether to keep target item feature
-    required bool need_target_feature = 2 [default = true];
-    // option: softmax, sigmoid
-    required string attention_normalizer = 3 [default = 'softmax'];
-}
+message AutoDisEmbedding {
+    required uint32 embedding_dim = 1;
+    required uint32 num_bins = 2;
+    required float keep_prob = 3 [default = 0.8];
+    required float temperature = 4;
+}
\ No newline at end of file
diff --git a/easy_rec/python/protos/seq_encoder.proto b/easy_rec/python/protos/seq_encoder.proto
new file mode 100644
index 000000000..7a608af18
--- /dev/null
+++ b/easy_rec/python/protos/seq_encoder.proto
@@ -0,0 +1,53 @@
+syntax = "proto2";
+package protos;
+
+import "easy_rec/python/protos/dnn.proto";
+
+
+message SequenceEncoder {
+    // encoder parameters
+    oneof encoder {
+        BSTEncoder bst = 101;
+        DINEncoder din = 102;
+    }
+    required bool force_share_embeddings = 1 [default = true];
+}
+
+message BSTEncoder {
+    // Size of the encoder layers and the pooler layer
+    required uint32 hidden_size = 1;
+    // Number of hidden layers in the Transformer encoder
+    required uint32 num_hidden_layers = 2;
+    // Number of attention heads for each attention layer in the Transformer encoder
+    required uint32 num_attention_heads = 3;
+    // The size of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
+    required uint32 intermediate_size = 4;
+    // The non-linear activation function (function or string) in the encoder and pooler.
+    required string hidden_act = 5 [default = 'gelu'];  // "gelu", "relu", "tanh" and "swish" are supported.
+    // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler
+    required float hidden_dropout_prob = 6 [default = 0.1];
+    // The dropout ratio for the attention probabilities
+    required float attention_probs_dropout_prob = 7 [default = 0.1];
+    // The maximum sequence length that this model might ever be used with
+    required uint32 max_position_embeddings = 8 [default = 512];
+    // Whether to add position embeddings for the position of each token in the text sequence
+    required bool use_position_embeddings = 9 [default = true];
+    // The stddev of the truncated_normal_initializer for initializing all weight matrices
+    required float initializer_range = 10 [default = 0.02];
+    // need contrastive learning
+    required bool need_contrastive_learning = 11 [default = false];
+    // the weight of contrastive learning loss
+    optional float contrastive_loss_weight = 12 [default = 1.0];
+    // whether need auto learn contrastive loss weight
+    optional bool auto_contrastive_loss_weight = 13 [default = false];
+}
+
+message DINEncoder {
+    // din attention layer
+    required DNN attention_dnn = 1;
+    // whether to keep target item feature
+    required bool need_target_feature = 2 [default = true];
+    // option: softmax, sigmoid
+    required string attention_normalizer = 3 [default = 'softmax'];
+}
+

From b1cb609d02876a4e82b84d6cd2451663573580f2 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 12 Jun 2023 16:09:20 +0800
Subject: [PATCH 30/54] [feat]: add backbone network

---
 easy_rec/python/layers/fibinet.py            |   2 +-
 easy_rec/python/protos/easy_rec_model.proto  |   5 +-
 easy_rec/python/protos/fibinet.proto         |   2 +-
 examples/configs/fibinet_on_movielens.config | 197 +++++++++++++++++++
 examples/configs/masknet_on_movielens.config | 194 ++++++++++++++++++
 examples/readme.md                           |   2 +
 6 files changed, 399 insertions(+), 3 deletions(-)
 create mode 100644 examples/configs/fibinet_on_movielens.config
 create mode 100644 examples/configs/masknet_on_movielens.config

diff --git a/easy_rec/python/layers/fibinet.py b/easy_rec/python/layers/fibinet.py
index d112561ff..4ba15789e 100644
--- a/easy_rec/python/layers/fibinet.py
+++ b/easy_rec/python/layers/fibinet.py
@@ -32,7 +32,7 @@ def __call__(self, inputs, is_training, l2_reg=None, *args, **kwargs):
     if self._config.HasField('bilinear'):
       conf = self._config.bilinear
       bilinear = BiLinear(
-          output_size=conf.output_units,
+          output_size=conf.num_output_units,
           bilinear_type=conf.type,
           bilinear_plus=conf.use_plus,
           name='%s_bilinear' % self.name)
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index c6a03c403..49a5a9592 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -25,9 +25,12 @@ import "easy_rec/python/protos/loss.proto";
 import "easy_rec/python/protos/rocket_launching.proto";
 import "easy_rec/python/protos/variational_dropout.proto";
 import "easy_rec/python/protos/multi_tower_recall.proto";
+
 // for input performance test
 message DummyModel {
-
+}
+// configure backbone network in a free style way
+message RankModel {
 }
 
 // for knowledge distillation
diff --git a/easy_rec/python/protos/fibinet.proto b/easy_rec/python/protos/fibinet.proto
index 124bebfe4..1d48448eb 100644
--- a/easy_rec/python/protos/fibinet.proto
+++ b/easy_rec/python/protos/fibinet.proto
@@ -13,7 +13,7 @@ message SENet {
 message Bilinear {
     required string type = 1 [default = 'interaction'];
     required bool use_plus = 2 [default = true];
-    required uint32 output_units = 3;
+    required uint32 num_output_units = 3;
 }
 
 message FiBiNetTower {
diff --git a/examples/configs/fibinet_on_movielens.config b/examples/configs/fibinet_on_movielens.config
new file mode 100644
index 000000000..8508172c6
--- /dev/null
+++ b/examples/configs/fibinet_on_movielens.config
@@ -0,0 +1,197 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/fibinet_on_movieslen_ckpt"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 100
+  sync_replicas: True
+  num_steps: 2500
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: 'all'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: "emb_list"
+      inputs: "all"
+      input_layer {
+        do_batch_norm: true
+        output_feature_list: true
+      }
+    }
+    blocks {
+      name: "fibinet"
+      inputs: "emb_list"
+      fibinet {
+        senet {
+          reduction_ratio: 4
+        }
+        bilinear {
+          type: 'each'
+          num_output_units: 512
+        }
+        mlp {
+          hidden_units: [512, 256]
+        }
+      }
+    }
+    concat_blocks: ['fibinet']
+  }
+  rank_model {
+  }
+  embedding_regularization: 1e-4
+}
+export_config {
+  multi_placeholder: false
+}
diff --git a/examples/configs/masknet_on_movielens.config b/examples/configs/masknet_on_movielens.config
new file mode 100644
index 000000000..4c7f507b9
--- /dev/null
+++ b/examples/configs/masknet_on_movielens.config
@@ -0,0 +1,194 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/masknet_on_movieslen_ckpt"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 100
+  sync_replicas: True
+  num_steps: 2500
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: 'all'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: "mask_net"
+      inputs: "all"
+      masknet {
+        mask_blocks {
+          aggregation_size: 512
+          output_size: 256
+        }
+        mask_blocks {
+          aggregation_size: 512
+          output_size: 256
+        }
+        mask_blocks {
+          aggregation_size: 512
+          output_size: 256
+        }
+        mlp {
+          hidden_units: [512, 256]
+        }
+      }
+    }
+    concat_blocks: ['mask_net']
+  }
+  rank_model {
+  }
+  embedding_regularization: 1e-4
+}
+export_config {
+  multi_placeholder: false
+}
diff --git a/examples/readme.md b/examples/readme.md
index 4861b0b42..8fa32e511 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -207,6 +207,8 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
   | DeepFM    | 1     | 0.8688 |
   | DCN       | 1     | 0.8576 |
   | AutoInt   | 1     | 0.8513 |
+  | MaskNet   | 1     | 0.8872 |
+  | FibiNet   | 1     | 0.8879 |
 
 - Criteo-Research
 

From 383cbed66851329960de1a140e5e133584cdcc07 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 12 Jun 2023 22:18:12 +0800
Subject: [PATCH 31/54] [feat]: add test config for backbone network

---
 easy_rec/python/layers/backbone.py            |  38 +-
 easy_rec/python/layers/fm.py                  |  36 +-
 easy_rec/python/protos/backbone.proto         |   2 +
 easy_rec/python/protos/easy_rec_model.proto   |   1 +
 .../configs/deepfm_backbone_on_criteo.config  | 560 ++++++++++++++++++
 ...pfm_backbone_on_criteo_with_autodis.config | 560 ++++++++++++++++++
 .../deepfm_backbone_on_movielens.config       | 194 ++++++
 examples/configs/deepfm_on_criteo.config      |  26 +-
 examples/data/criteo/download_and_process.sh  |   3 +-
 examples/data/criteo/process_criteo_kaggle.py |   5 +-
 examples/rank_model/readme.md                 |   4 +-
 examples/readme.md                            |   4 +
 12 files changed, 1395 insertions(+), 38 deletions(-)
 create mode 100644 examples/configs/deepfm_backbone_on_criteo.config
 create mode 100644 examples/configs/deepfm_backbone_on_criteo_with_autodis.config
 create mode 100644 examples/configs/deepfm_backbone_on_movielens.config

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 3e95ba709..8caa31b80 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -10,6 +10,7 @@
 from easy_rec.python.layers.numerical_embedding import PeriodicEmbedding, AutoDisEmbedding
 from easy_rec.python.layers.fibinet import FiBiNetLayer
 from easy_rec.python.layers.mask_net import MaskNet
+from easy_rec.python.layers.fm import FMLayer
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -96,8 +97,8 @@ def block_input(self, config, block_outputs):
   def __call__(self, is_training, *args, **kwargs):
     block_outputs = {}
     blocks = self._dag.topological_sort()
-    logging.info("backbone topological: " + ','.join(blocks))
-    print("backbone topological: " + ','.join(blocks))
+    logging.info("backbone topological order: " + ','.join(blocks))
+    print("backbone topological order: " + ','.join(blocks))
     for block in blocks:
       config = self._name_to_blocks[block]
       layer = config.WhichOneof('layer')
@@ -108,60 +109,59 @@ def __call__(self, is_training, *args, **kwargs):
         output = input_layer(config.inputs[0], is_training)
         block_outputs[block] = output
       elif layer == 'periodic_embedding':
+        input_feature = self.block_input(config, block_outputs)
         conf = config.periodic_embedding
         num_emb = PeriodicEmbedding(conf.embedding_dim, stddev=conf.coef_stddev, scope=block)
-        input_feature = self.block_input(config, block_outputs)
         block_outputs[block] = num_emb(input_feature)
       elif layer == 'auto_dis_embedding':
-        conf = config.auto_dis_embedding
-        num_emb = AutoDisEmbedding(conf, scope=block)
         input_feature = self.block_input(config, block_outputs)
+        num_emb = AutoDisEmbedding(config.auto_dis_embedding, scope=block)
         block_outputs[block] = num_emb(input_feature)
       elif layer == 'highway':
-        conf = config.highway
         input_feature = self.block_input(config, block_outputs)
-        highway_fea = highway(
+        conf = config.highway
+        highway_layer = highway(
           input_feature,
           conf.emb_size,
           activation=conf.activation,
           dropout=conf.dropout_rate,
           scope=block)
-        block_outputs[block] = highway_fea(input_feature)
+        block_outputs[block] = highway_layer(input_feature)
       elif layer == 'mlp':
+        input_feature = self.block_input(config, block_outputs)
         mlp = dnn.DNN(
           config.mlp,
           self._l2_reg,
           name='%s_mlp' % block,
           is_training=is_training)
-        input_feature = self.block_input(config, block_outputs)
-        output = mlp(input_feature)
-        block_outputs[block] = output
+        block_outputs[block] = mlp(input_feature)
       elif layer == 'sequence_encoder':
         block_outputs[block] = self.sequence_encoder(config, is_training)
       elif layer == 'masknet':
-        conf = config.masknet
+        input_feature = self.block_input(config, block_outputs)
         mask_net = MaskNet(
-          conf,
+          config.masknet,
           name=block,
           reuse=tf.AUTO_REUSE)
-        input_feature = self.block_input(config, block_outputs)
         output = mask_net(
           input_feature, is_training, l2_reg=self._l2_reg)
         block_outputs[block] = output
       elif layer == 'senet':
-        conf = config.senet
-        senet = SENet(conf, name=block)
         input_feature = self.block_input(config, block_outputs)
+        senet = SENet(config.senet, name=block)
         output = senet(input_feature)
         block_outputs[block] = output
       elif layer == 'fibinet':
-        conf = config.fibinet
-        fibinet = FiBiNetLayer(conf, name=block)
         input_feature = self.block_input(config, block_outputs)
+        fibinet = FiBiNetLayer(config.fibinet, name=block)
         output = fibinet(input_feature, is_training, l2_reg=self._l2_reg)
         block_outputs[block] = output
+      elif layer == 'fm':
+        input_feature = self.block_input(config, block_outputs)
+        fm = FMLayer()
+        block_outputs[block] = fm(input_feature)
       else:
-        raise ValueError('Unsupported backbone layer:' + layer)
+        raise NotImplementedError('Unsupported backbone layer:' + layer)
 
     temp = []
     for output in self._config.concat_blocks:
diff --git a/easy_rec/python/layers/fm.py b/easy_rec/python/layers/fm.py
index c638456a4..198d6b8d6 100644
--- a/easy_rec/python/layers/fm.py
+++ b/easy_rec/python/layers/fm.py
@@ -19,9 +19,41 @@ def __init__(self, name='fm'):
 
   def __call__(self, fm_fea):
     with tf.name_scope(self._name):
-      fm_feas = tf.concat(fm_fea, axis=1)
-      fm_feas = tf.expand_dims(fm_feas, axis=1)
+      fm_feas = [tf.expand_dims(x, axis=1) for x in fm_fea]
+      fm_feas = tf.concat(fm_feas, axis=1)
       sum_square = tf.square(tf.reduce_sum(fm_feas, 1))
       square_sum = tf.reduce_sum(tf.square(fm_feas), 1)
       y_v = 0.5 * tf.subtract(sum_square, square_sum)
     return y_v
+
+
+class FMLayer(object):
+  """Factorization Machine models pairwise (order-2) feature interactions
+   without linear term and bias.
+    Input shape
+      - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
+      - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
+    Output shape
+      - 2D tensor with shape: ``(batch_size, 1)``.
+    References
+      - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
+  """
+  def __call__(self, inputs):
+    if type(inputs) == list:
+      emb_dims = set()
+      for x in inputs:
+        emb_dims.add(int(x.shape[-1]))
+      assert len(emb_dims) == 1, 'all embedding dim must be the same in FM layer:' + ','.join([str(d) for d in emb_dims])
+      num_fea = len(inputs)
+      emb_dim = emb_dims.pop()
+      fea = tf.concat(inputs, axis=-1)
+      fea = tf.reshape(fea, [-1, num_fea, emb_dim])
+    else:
+      assert inputs.shape.ndims == 3, 'input of FM layer must be a 3D tensor or a list of 2D tensors'
+      fea = inputs
+
+    square_of_sum = tf.square(tf.reduce_sum(fea, axis=1, keepdims=True))
+    sum_of_square = tf.reduce_sum(fea * fea, axis=1, keepdims=True)
+    cross_term = square_of_sum - sum_of_square
+    cross_term = 0.5 * tf.reduce_sum(cross_term, axis=2, keepdims=False)
+    return cross_term
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
index f17b22a10..3dc86cebb 100644
--- a/easy_rec/python/protos/backbone.proto
+++ b/easy_rec/python/protos/backbone.proto
@@ -2,6 +2,7 @@ syntax = "proto2";
 package protos;
 
 import "easy_rec/python/protos/dnn.proto";
+import "easy_rec/python/protos/fm.proto";
 import "easy_rec/python/protos/layer.proto";
 import "easy_rec/python/protos/fibinet.proto";
 import "easy_rec/python/protos/masknet.proto";
@@ -25,6 +26,7 @@ message Block {
         MaskNet masknet = 107;
         SENet senet = 108;
         FiBiNetTower fibinet = 109;
+        FM fm = 110;
     }
 }
 
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 49a5a9592..3f4f851b9 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -31,6 +31,7 @@ message DummyModel {
 }
 // configure backbone network in a free style way
 message RankModel {
+    optional float l2_regularization = 1;
 }
 
 // for knowledge distillation
diff --git a/examples/configs/deepfm_backbone_on_criteo.config b/examples/configs/deepfm_backbone_on_criteo.config
new file mode 100644
index 000000000..a0982a16e
--- /dev/null
+++ b/examples/configs/deepfm_backbone_on_criteo.config
@@ -0,0 +1,560 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/deepfm_criteo_ckpt"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 1000
+  sync_replicas: True
+  num_steps: 20000
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "features"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'emb_list'
+      inputs: 'features'
+      input_layer {
+        output_feature_list: true
+      }
+    }
+    blocks {
+      name: 'fm'
+      inputs: 'emb_list'
+      fm {}
+    }
+    blocks {
+      name: 'deep'
+      inputs: 'features'
+      mlp {
+        hidden_units: [256, 128, 64]
+      }
+    }
+    concat_blocks: ['fm', 'deep']
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
new file mode 100644
index 000000000..1dcdf7512
--- /dev/null
+++ b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
@@ -0,0 +1,560 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/deepfm_autodis_criteo_ckpt"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 1000
+  sync_replicas: True
+  num_steps: 20000
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "features"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'emb_list'
+      inputs: 'features'
+      input_layer {
+        output_feature_list: true
+      }
+    }
+    blocks {
+      name: 'fm'
+      inputs: 'emb_list'
+      fm {}
+    }
+    blocks {
+      name: 'deep'
+      inputs: 'features'
+      mlp {
+        hidden_units: [256, 128, 64]
+      }
+    }
+    concat_blocks: ['fm', 'deep']
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/deepfm_backbone_on_movielens.config b/examples/configs/deepfm_backbone_on_movielens.config
new file mode 100644
index 000000000..46a79d83b
--- /dev/null
+++ b/examples/configs/deepfm_backbone_on_movielens.config
@@ -0,0 +1,194 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/deepfm_backbone_movieslen_ckpt"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 100
+  sync_replicas: True
+  num_steps: 2500
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [8, 4, 4]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: 'features'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    feature_names: 'title'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: 'emb_list'
+      inputs: 'features'
+      input_layer {
+        output_feature_list: true
+      }
+    }
+    blocks {
+      name: 'fm'
+      inputs: 'emb_list'
+      fm {}
+    }
+    blocks {
+      name: 'deep'
+      inputs: 'features'
+      mlp {
+        hidden_units: [256, 128, 64]
+      }
+    }
+    concat_blocks: ['fm', 'deep']
+  }
+  rank_model {
+    l2_regularization: 1e-4
+  }
+  embedding_regularization: 1e-4
+}
+export_config {
+  multi_placeholder: false
+}
diff --git a/examples/configs/deepfm_on_criteo.config b/examples/configs/deepfm_on_criteo.config
index c482cf246..fc8537f0d 100644
--- a/examples/configs/deepfm_on_criteo.config
+++ b/examples/configs/deepfm_on_criteo.config
@@ -241,91 +241,91 @@ data_config {
 feature_config: {
   features: {
     input_names: "F1"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val:0.0
     max_val: 5775.0
   }
   features: {
     input_names: "F2"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: -3.0
     max_val: 257675.0
   }
   features: {
     input_names: "F3"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 65535.0
   }
   features: {
     input_names: "F4"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 969.0
   }
   features: {
     input_names: "F5"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 23159456.0
   }
   features: {
     input_names: "F6"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 431037.0
   }
   features: {
     input_names: "F7"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 56311.0
   }
   features: {
     input_names: "F8"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 6047.0
   }
   features: {
     input_names: "F9"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 29019.0
   }
   features: {
     input_names: "F10"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 46.0
   }
   features: {
     input_names: "F11"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 231.0
   }
   features: {
     input_names: "F12"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 4008.0
   }
   features: {
     input_names: "F13"
-    embedding_dim:10
+    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 7393.0
diff --git a/examples/data/criteo/download_and_process.sh b/examples/data/criteo/download_and_process.sh
index 30061a862..f0cc8aef9 100644
--- a/examples/data/criteo/download_and_process.sh
+++ b/examples/data/criteo/download_and_process.sh
@@ -1,6 +1,7 @@
 #! /bin/bash
 if [ "$(uname)" == "Darwin" ]; then
-    curl -O https://easy-rec.oss-cn-hangzhou.aliyuncs.com/data/criteo_kaggle/kaggle-display-advertising-challenge-dataset.tar.gz
+    #curl -O https://easy-rec.oss-cn-hangzhou.aliyuncs.com/data/criteo_kaggle/kaggle-display-advertising-challenge-dataset.tar.gz
+    wget -c https://easy-rec.oss-cn-hangzhou.aliyuncs.com/data/criteo_kaggle/kaggle-display-advertising-challenge-dataset.tar.gz
 elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
     wget -c https://easy-rec.oss-cn-hangzhou.aliyuncs.com/data/criteo_kaggle/kaggle-display-advertising-challenge-dataset.tar.gz
 elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then
diff --git a/examples/data/criteo/process_criteo_kaggle.py b/examples/data/criteo/process_criteo_kaggle.py
index 60b7d9776..5b9cb4f34 100644
--- a/examples/data/criteo/process_criteo_kaggle.py
+++ b/examples/data/criteo/process_criteo_kaggle.py
@@ -11,8 +11,9 @@
 samples_num = data_train.shape[0]
 print('samples_num:', samples_num, round(samples_num * 0.9))
 
-data_train[:round(samples_num * 0.9)].to_csv(
+train_num = int(round(samples_num * 0.9))
+data_train[:train_num].to_csv(
     r'criteo_train_data', index=False, sep='\t', mode='a', header=False)
-data_train[round(samples_num * 0.9):].to_csv(
+data_train[train_num:].to_csv(
     r'criteo_test_data', index=False, sep='\t', mode='a', header=False)
 print('Done.')
diff --git a/examples/rank_model/readme.md b/examples/rank_model/readme.md
index 15d3f4dca..f6a2ba791 100644
--- a/examples/rank_model/readme.md
+++ b/examples/rank_model/readme.md
@@ -32,10 +32,12 @@
 | MovieLens-1M | DeepFM    | 0.8688 |
 | MovieLens-1M | DCN       | 0.8576 |
 | MovieLens-1M | AutoInt   | 0.8513 |
+| MovieLens-1M | MaskNet   | 0.8872 |
+| MovieLens-1M | FibiNet   | 0.8879 |
 
 # Criteo Research Kaggle 数据集
 
-在MovieLens-1M 数据集中, 我们提供了2个模型上的demo示例。
+在 `Criteo Research Kaggle` 数据集中, 我们提供了2个模型上的demo示例。
 
 [FM](fm.md) / [DeepFM](deepfm.md)
 
diff --git a/examples/readme.md b/examples/readme.md
index 8fa32e511..b95adc8b1 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -77,6 +77,10 @@ EasyRec的模型训练和评估都是基于config配置文件的，配置文件
 
 - [autoint_on_movielens.config](configs/autoint_on_movielens.config)
 
+- [masknet_on_movielens.config](configs/masknet_on_movielens.config)
+
+- [fibinet_on_movielens.config](configs/fibinet_on_movielens.config)
+
 - [fm_on_criteo.config](configs/fm_on_criteo.config)
 
 - [deepfm_on_criteo.config](configs/deepfm_on_criteo.config)

From 1114aab534cbe991de9f061c6b1e1cca6d8bd5b3 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Wed, 14 Jun 2023 17:06:35 +0800
Subject: [PATCH 32/54] [feat]: add more backbone blocks

---
 easy_rec/python/compat/array_ops.py           |   2 +-
 .../compat/feature_column/feature_column.py   | 895 +++++++++---------
 .../feature_column/feature_column_v2.py       |  10 +
 easy_rec/python/input/input.py                |   7 +-
 easy_rec/python/layers/backbone.py            | 239 +++--
 easy_rec/python/layers/common_layers.py       |  83 ++
 easy_rec/python/layers/fibinet.py             |   8 +-
 easy_rec/python/layers/fm.py                  |  51 +-
 easy_rec/python/layers/input_layer.py         |  32 +-
 easy_rec/python/layers/mask_net.py            |   9 +-
 easy_rec/python/layers/numerical_embedding.py | 152 ++-
 easy_rec/python/model/easy_rec_estimator.py   |   3 +-
 easy_rec/python/model/easy_rec_model.py       |  18 +-
 easy_rec/python/model/rank_model.py           | 206 ++--
 easy_rec/python/protos/backbone.proto         |  44 +-
 easy_rec/python/protos/dnn.proto              |  13 +
 easy_rec/python/protos/easy_rec_model.proto   |   4 +-
 easy_rec/python/protos/fm.proto               |   1 +
 easy_rec/python/protos/layer.proto            |  23 +-
 easy_rec/python/protos/seq_encoder.proto      |   1 -
 easy_rec/python/train_eval.py                 |   5 +
 easy_rec/python/utils/__init__.py             |  17 +
 easy_rec/python/utils/dag.py                  | 398 ++++----
 easy_rec/python/utils/tf_utils.py             |  36 +
 .../configs/deepfm_backbone_on_criteo.config  |  95 +-
 ...pfm_backbone_on_criteo_with_autodis.config | 119 +--
 ...fm_backbone_on_criteo_with_periodic.config | 571 +++++++++++
 .../configs/dlrm_backbone_on_criteo.config    | 566 +++++++++++
 examples/readme.md                            |   8 +
 29 files changed, 2586 insertions(+), 1030 deletions(-)
 create mode 100644 examples/configs/deepfm_backbone_on_criteo_with_periodic.config
 create mode 100644 examples/configs/dlrm_backbone_on_criteo.config

diff --git a/easy_rec/python/compat/array_ops.py b/easy_rec/python/compat/array_ops.py
index 3e8929ceb..d788bc8c1 100644
--- a/easy_rec/python/compat/array_ops.py
+++ b/easy_rec/python/compat/array_ops.py
@@ -194,7 +194,7 @@ def repeat_with_axis(data, repeats, axis, name=None):
 
 
 def repeat(input, repeats, axis=None, name=None):  # pylint: disable=redefined-builtin
-  """Repeat elements of `input`
+  """Repeat elements of `input`.
 
   Args:
     input: An `N`-dimensional Tensor.
diff --git a/easy_rec/python/compat/feature_column/feature_column.py b/easy_rec/python/compat/feature_column/feature_column.py
index 7d8419528..d0f23dfbb 100644
--- a/easy_rec/python/compat/feature_column/feature_column.py
+++ b/easy_rec/python/compat/feature_column/feature_column.py
@@ -179,15 +179,15 @@ def _internal_input_layer(features,
                           cols_to_output_tensors=None,
                           from_template=False,
                           feature_name_to_output_tensors=None,
-                          do_normalize=False):
+                          sort_feature_columns_by_name=True):
   """See input_layer, `scope` is a name or variable scope to use."""
   feature_columns = _normalize_feature_columns(feature_columns)
   for column in feature_columns:
     if not isinstance(column, _DenseColumn):
       raise ValueError(
-          'Items of feature_columns must be a _DenseColumn. '
-          'You can wrap a categorical column with an '
-          'embedding_column or indicator_column. Given: {}'.format(column))
+        'Items of feature_columns must be a _DenseColumn. '
+        'You can wrap a categorical column with an '
+        'embedding_column or indicator_column. Given: {}'.format(column))
   weight_collections = list(weight_collections or [])
   if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
     weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
@@ -197,42 +197,28 @@ def _internal_input_layer(features,
   def _get_logits():  # pylint: disable=missing-docstring
     builder = _LazyBuilder(features)
     output_tensors = []
-    ordered_columns = []
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
+    if sort_feature_columns_by_name:
+      ordered_columns = sorted(feature_columns, key=lambda x: x.name)
+    else:
+      ordered_columns = feature_columns
+    for column in ordered_columns:
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name):  # pylint: disable=protected-access
         tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-            builder,
-            weight_collections=weight_collections,
-            trainable=trainable)
+          builder,
+          weight_collections=weight_collections,
+          trainable=trainable)
         num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
         batch_size = array_ops.shape(tensor)[0]
         output_tensor = array_ops.reshape(
-            tensor, shape=(batch_size, num_elements))
-        if do_normalize:
-          from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn,\
-            NumericColumn, WeightedCategoricalColumn
-          from tensorflow.python.layers.normalization import batch_normalization
-          if isinstance(column, EmbeddingColumn) or isinstance(
-              column, _SharedEmbeddingColumn):
-            fc = column.categorical_column
-            if isinstance(fc, WeightedCategoricalColumn
-                          ) and fc.weight_feature_key.endswith('_raw_proj_val'):
-              output_tensor = layer_norm(
-                  output_tensor, name='ln_' + column.name)
-            else:
-              output_tensor = batch_normalization(
-                  output_tensor, name='bn_' + column.name)
-          elif isinstance(column, NumericColumn) and int(column.shape[-1]) > 1:
-            output_tensor = layer_norm(output_tensor, name='ln_' + column.name)
+          tensor, shape=(batch_size, num_elements))
         output_tensors.append(output_tensor)
         if cols_to_vars is not None:
           # Retrieve any variables created (some _DenseColumn's don't create
           # variables, in which case an empty list is returned).
           cols_to_vars[column] = ops.get_collection(
-              ops.GraphKeys.GLOBAL_VARIABLES,
-              scope=variable_scope.get_variable_scope().name)
+            ops.GraphKeys.GLOBAL_VARIABLES,
+            scope=variable_scope.get_variable_scope().name)
         if cols_to_output_tensors is not None:
           cols_to_output_tensors[column] = output_tensor
         if feature_name_to_output_tensors is not None:
@@ -258,7 +244,7 @@ def input_layer(features,
                 cols_to_vars=None,
                 cols_to_output_tensors=None,
                 feature_name_to_output_tensors=None,
-                do_normalize=False):
+                sort_feature_columns_by_name=True):
   """Returns a dense `Tensor` as input layer based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
@@ -306,8 +292,7 @@ def input_layer(features,
     cols_to_output_tensors: If not `None`, must be a dictionary that will be
       filled with a mapping from '_FeatureColumn' to the associated
       output `Tensor`s.
-    do_normalize: Whether to do layer normalization for numerical features and
-      batch normalization operation for categorical features.
+    sort_feature_columns_by_name: whether to sort feature columns
 
   Returns:
     A `Tensor` which represents input layer of a model. Its shape
@@ -318,14 +303,14 @@ def input_layer(features,
     ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
   """
   return _internal_input_layer(
-      features,
-      feature_columns,
-      weight_collections=weight_collections,
-      trainable=trainable,
-      cols_to_vars=cols_to_vars,
-      cols_to_output_tensors=cols_to_output_tensors,
-      feature_name_to_output_tensors=feature_name_to_output_tensors,
-      do_normalize=do_normalize)
+    features,
+    feature_columns,
+    weight_collections=weight_collections,
+    trainable=trainable,
+    cols_to_vars=cols_to_vars,
+    cols_to_output_tensors=cols_to_output_tensors,
+    feature_name_to_output_tensors=feature_name_to_output_tensors,
+    sort_feature_columns_by_name=sort_feature_columns_by_name)
 
 
 # TODO(akshayka): InputLayer should be a subclass of Layer, and it
@@ -349,17 +334,17 @@ def __init__(self,
     self._cols_to_vars = cols_to_vars
     self._name = name
     self._input_layer_template = template.make_template(
-        self._name, _internal_input_layer, create_scope_now_=create_scope_now)
+      self._name, _internal_input_layer, create_scope_now_=create_scope_now)
     self._scope = self._input_layer_template.variable_scope
 
   def __call__(self, features):
     return self._input_layer_template(
-        features=features,
-        feature_columns=self._feature_columns,
-        weight_collections=self._weight_collections,
-        trainable=self._trainable,
-        cols_to_vars=None,
-        from_template=True)
+      features=features,
+      feature_columns=self._feature_columns,
+      weight_collections=self._weight_collections,
+      trainable=self._trainable,
+      cols_to_vars=None,
+      from_template=True)
 
   @property
   def name(self):
@@ -515,12 +500,12 @@ def linear_model(features,
   with variable_scope.variable_scope(None, 'linear_model') as vs:
     model_name = _strip_leading_slashes(vs.name)
   linear_model_layer = _LinearModel(
-      feature_columns=feature_columns,
-      units=units,
-      sparse_combiner=sparse_combiner,
-      weight_collections=weight_collections,
-      trainable=trainable,
-      name=model_name)
+    feature_columns=feature_columns,
+    units=units,
+    sparse_combiner=sparse_combiner,
+    weight_collections=weight_collections,
+    trainable=trainable,
+    name=model_name)
   retval = linear_model_layer(features)  # pylint: disable=not-callable
   if cols_to_vars is not None:
     cols_to_vars.update(linear_model_layer.cols_to_vars())
@@ -564,7 +549,7 @@ def __init__(self,
                name=None,
                **kwargs):
     super(_FCLinearWrapper, self).__init__(
-        trainable=trainable, name=name, **kwargs)
+      trainable=trainable, name=name, **kwargs)
     self._feature_column = feature_column
     self._units = units
     self._sparse_combiner = sparse_combiner
@@ -573,30 +558,30 @@ def __init__(self,
   def build(self, _):
     if isinstance(self._feature_column, _CategoricalColumn):
       weight = self.add_variable(
-          name='weights',
-          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
-          initializer=init_ops.zeros_initializer(),
-          trainable=self.trainable)
+        name='weights',
+        shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
+        initializer=init_ops.zeros_initializer(),
+        trainable=self.trainable)
     else:
       num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
       weight = self.add_variable(
-          name='weights',
-          shape=[num_elements, self._units],
-          initializer=init_ops.zeros_initializer(),
-          trainable=self.trainable)
+        name='weights',
+        shape=[num_elements, self._units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=self.trainable)
     _add_to_collections(weight, self._weight_collections)
     self._weight_var = weight
     self.built = True
 
   def call(self, builder):
     weighted_sum = _create_weighted_sum(
-        column=self._feature_column,
-        builder=builder,
-        units=self._units,
-        sparse_combiner=self._sparse_combiner,
-        weight_collections=self._weight_collections,
-        trainable=self.trainable,
-        weight_var=self._weight_var)
+      column=self._feature_column,
+      builder=builder,
+      units=self._units,
+      sparse_combiner=self._sparse_combiner,
+      weight_collections=self._weight_collections,
+      trainable=self.trainable,
+      weight_var=self._weight_var)
     return weighted_sum
 
 
@@ -615,10 +600,10 @@ def __init__(self,
 
   def build(self, _):
     self._bias_variable = self.add_variable(
-        'bias_weights',
-        shape=[self._units],
-        initializer=init_ops.zeros_initializer(),
-        trainable=self.trainable)
+      'bias_weights',
+      shape=[self._units],
+      initializer=init_ops.zeros_initializer(),
+      trainable=self.trainable)
     _add_to_collections(self._bias_variable, self._weight_collections)
     self.built = True
 
@@ -674,11 +659,11 @@ def __init__(self,
       column_layers[column_name] = column_layer
     self._column_layers = self._add_layers(column_layers)
     self._bias_layer = _BiasLayer(
-        units=units,
-        trainable=trainable,
-        weight_collections=self._weight_collections,
-        name='bias_layer',
-        **kwargs)
+      units=units,
+      trainable=trainable,
+      weight_collections=self._weight_collections,
+      name='bias_layer',
+      **kwargs)
     self._cols_to_vars = {}
 
   def cols_to_vars(self):
@@ -694,8 +679,8 @@ def call(self, features):
       for column in self._feature_columns:
         if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
           raise ValueError(
-              'Items of feature_columns must be either a '
-              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
+            'Items of feature_columns must be either a '
+            '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
       weighted_sums = []
       ordered_columns = []
       builder = _LazyBuilder(features)
@@ -705,17 +690,17 @@ def call(self, features):
         weighted_sum = layer(builder)
         weighted_sums.append(weighted_sum)
         self._cols_to_vars[column] = ops.get_collection(
-            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
+          ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
 
       _verify_static_batch_size_equality(weighted_sums, ordered_columns)
       predictions_no_bias = math_ops.add_n(
-          weighted_sums, name='weighted_sum_no_bias')
+        weighted_sums, name='weighted_sum_no_bias')
       predictions = nn_ops.bias_add(
-          predictions_no_bias,
-          self._bias_layer(  # pylint: disable=not-callable
-              builder,
-              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
-          name='weighted_sum')
+        predictions_no_bias,
+        self._bias_layer(  # pylint: disable=not-callable
+          builder,
+          scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
+        name='weighted_sum')
       bias = self._bias_layer.variables[0]
       self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
     return predictions
@@ -920,31 +905,31 @@ def model_fn(features, ...):
   if (initializer is not None) and (not callable(initializer)):
     raise ValueError('initializer must be callable if specified. '
                      'Embedding of column_name: {}'.format(
-                         categorical_column.name))
+      categorical_column.name))
   if initializer is None:
     initializer = init_ops.truncated_normal_initializer(
-        mean=0.0, stddev=0.01 / math.sqrt(dimension))
+      mean=0.0, stddev=0.01 / math.sqrt(dimension))
 
   embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
 
   def _creator(weight_collections, scope):
     embedding_column_layer = _EmbeddingColumnLayer(
-        embedding_shape=embedding_shape,
-        initializer=initializer,
-        weight_collections=weight_collections,
-        trainable=trainable,
-        name='embedding_column_layer')
+      embedding_shape=embedding_shape,
+      initializer=initializer,
+      weight_collections=weight_collections,
+      trainable=trainable,
+      name='embedding_column_layer')
     return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
 
   return _EmbeddingColumn(
-      categorical_column=categorical_column,
-      dimension=dimension,
-      combiner=combiner,
-      layer_creator=_creator,
-      ckpt_to_load_from=ckpt_to_load_from,
-      tensor_name_in_ckpt=tensor_name_in_ckpt,
-      max_norm=max_norm,
-      trainable=trainable)
+    categorical_column=categorical_column,
+    dimension=dimension,
+    combiner=combiner,
+    layer_creator=_creator,
+    ckpt_to_load_from=ckpt_to_load_from,
+    tensor_name_in_ckpt=tensor_name_in_ckpt,
+    max_norm=max_norm,
+    trainable=trainable)
 
 
 def _numeric_column(key,
@@ -1011,15 +996,15 @@ def _numeric_column(key,
 
   if normalizer_fn is not None and not callable(normalizer_fn):
     raise TypeError(
-        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
+      'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
 
   fc_utils.assert_key_is_string(key)
   return _NumericColumn(
-      key,
-      shape=shape,
-      default_value=default_value,
-      dtype=dtype,
-      normalizer_fn=normalizer_fn)
+    key,
+    shape=shape,
+    default_value=default_value,
+    dtype=dtype,
+    normalizer_fn=normalizer_fn)
 
 
 def _bucketized_column(source_column, boundaries):
@@ -1090,8 +1075,8 @@ def _bucketized_column(source_column, boundaries):
   """
   if not isinstance(source_column, _NumericColumn):
     raise ValueError(
-        'source_column must be a column generated with numeric_column(). '
-        'Given: {}'.format(source_column))
+      'source_column must be a column generated with numeric_column(). '
+      'Given: {}'.format(source_column))
   if len(source_column.shape) > 1:
     raise ValueError('source_column must be one-dimensional column. '
                      'Given: {}'.format(source_column))
@@ -1154,7 +1139,7 @@ def _categorical_column_with_hash_bucket(key,
   if hash_bucket_size < 1:
     raise ValueError('hash_bucket_size must be at least 1. '
                      'hash_bucket_size: {}, key: {}'.format(
-                         hash_bucket_size, key))
+      hash_bucket_size, key))
 
   fc_utils.assert_key_is_string(key)
   fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
@@ -1256,8 +1241,8 @@ def _categorical_column_with_vocabulary_file(key,
     with gfile.GFile(vocabulary_file) as f:
       vocabulary_size = sum(1 for _ in f)
     logging.info(
-        'vocabulary_size = %d in %s is inferred from the number of elements '
-        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
+      'vocabulary_size = %d in %s is inferred from the number of elements '
+      'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
 
   # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
   if vocabulary_size < 1:
@@ -1265,20 +1250,20 @@ def _categorical_column_with_vocabulary_file(key,
   if num_oov_buckets:
     if default_value is not None:
       raise ValueError(
-          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
-              key))
+        'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
+          key))
     if num_oov_buckets < 0:
       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
-          num_oov_buckets, key))
+        num_oov_buckets, key))
   fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
   fc_utils.assert_key_is_string(key)
   return _VocabularyFileCategoricalColumn(
-      key=key,
-      vocabulary_file=vocabulary_file,
-      vocabulary_size=vocabulary_size,
-      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
-      default_value=-1 if default_value is None else default_value,
-      dtype=dtype)
+    key=key,
+    vocabulary_file=vocabulary_file,
+    vocabulary_size=vocabulary_size,
+    num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
+    default_value=-1 if default_value is None else default_value,
+    dtype=dtype)
 
 
 def _categorical_column_with_vocabulary_list(key,
@@ -1363,38 +1348,38 @@ def _categorical_column_with_vocabulary_list(key,
   """
   if (vocabulary_list is None) or (len(vocabulary_list) < 1):
     raise ValueError(
-        'vocabulary_list {} must be non-empty, column_name: {}'.format(
-            vocabulary_list, key))
+      'vocabulary_list {} must be non-empty, column_name: {}'.format(
+        vocabulary_list, key))
   if len(set(vocabulary_list)) != len(vocabulary_list):
     raise ValueError(
-        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
-            vocabulary_list, key))
+      'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
+        vocabulary_list, key))
   vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
   if num_oov_buckets:
     if default_value != -1:
       raise ValueError(
-          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
-              key))
+        'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
+          key))
     if num_oov_buckets < 0:
       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
-          num_oov_buckets, key))
+        num_oov_buckets, key))
   fc_utils.assert_string_or_int(
-      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
+    vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
   if dtype is None:
     dtype = vocabulary_dtype
   elif dtype.is_integer != vocabulary_dtype.is_integer:
     raise ValueError(
-        'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
-            dtype, vocabulary_dtype, key))
+      'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
+        dtype, vocabulary_dtype, key))
   fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
   fc_utils.assert_key_is_string(key)
 
   return _VocabularyListCategoricalColumn(
-      key=key,
-      vocabulary_list=tuple(vocabulary_list),
-      dtype=dtype,
-      default_value=default_value,
-      num_oov_buckets=num_oov_buckets)
+    key=key,
+    vocabulary_list=tuple(vocabulary_list),
+    dtype=dtype,
+    default_value=default_value,
+    num_oov_buckets=num_oov_buckets)
 
 
 def _categorical_column_with_identity(key, num_buckets, default_value=None):
@@ -1453,15 +1438,15 @@ def _categorical_column_with_identity(key, num_buckets, default_value=None):
   """
   if num_buckets < 1:
     raise ValueError('num_buckets {} < 1, column_name {}'.format(
-        num_buckets, key))
+      num_buckets, key))
   if (default_value is not None) and ((default_value < 0) or
                                       (default_value >= num_buckets)):
     raise ValueError(
-        'default_value {} not in range [0, {}), column_name {}'.format(
-            default_value, num_buckets, key))
+      'default_value {} not in range [0, {}), column_name {}'.format(
+        default_value, num_buckets, key))
   fc_utils.assert_key_is_string(key)
   return _IdentityCategoricalColumn(
-      key=key, num_buckets=num_buckets, default_value=default_value)
+    key=key, num_buckets=num_buckets, default_value=default_value)
 
 
 def _indicator_column(categorical_column):
@@ -1568,9 +1553,9 @@ def _weighted_categorical_column(categorical_column,
   if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype {} is not convertible to float.'.format(dtype))
   return _WeightedCategoricalColumn(
-      categorical_column=categorical_column,
-      weight_feature_key=weight_feature_key,
-      dtype=dtype)
+    categorical_column=categorical_column,
+    weight_feature_key=weight_feature_key,
+    dtype=dtype)
 
 
 def _crossed_column(keys, hash_bucket_size, hash_key=None):
@@ -1682,21 +1667,21 @@ def _crossed_column(keys, hash_bucket_size, hash_key=None):
                      'hash_bucket_size: {}'.format(hash_bucket_size))
   if not keys or len(keys) < 2:
     raise ValueError(
-        'keys must be a list with length > 1. Given: {}'.format(keys))
+      'keys must be a list with length > 1. Given: {}'.format(keys))
   for key in keys:
     if (not isinstance(key, six.string_types) and
         not isinstance(key, _CategoricalColumn)):
       raise ValueError(
-          'Unsupported key type. All keys must be either string, or '
-          'categorical column except _HashedCategoricalColumn. '
-          'Given: {}'.format(key))
+        'Unsupported key type. All keys must be either string, or '
+        'categorical column except _HashedCategoricalColumn. '
+        'Given: {}'.format(key))
     if isinstance(key, _HashedCategoricalColumn):
       raise ValueError(
-          'categorical_column_with_hash_bucket is not supported for crossing. '
-          'Hashing before crossing will increase probability of collision. '
-          'Instead, use the feature name as a string. Given: {}'.format(key))
+        'categorical_column_with_hash_bucket is not supported for crossing. '
+        'Hashing before crossing will increase probability of collision. '
+        'Instead, use the feature name as a string. Given: {}'.format(key))
   return _CrossedColumn(
-      keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key)
+    keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key)
 
 
 # TODO(rohanj): Clearly define semantics of this layer.
@@ -1725,7 +1710,7 @@ def __init__(self,
       **kwargs: keyword named properties.
     """
     super(_EmbeddingColumnLayer, self).__init__(
-        trainable=trainable, name=name, **kwargs)
+      trainable=trainable, name=name, **kwargs)
     self._embedding_shape = embedding_shape
     self._initializer = initializer
     self._weight_collections = weight_collections
@@ -1741,11 +1726,11 @@ def set_weight_collections(self, weight_collections):
 
   def build(self, _):
     self._embedding_weight_var = self.add_variable(
-        name='embedding_weights',
-        shape=self._embedding_shape,
-        dtype=dtypes.float32,
-        initializer=self._initializer,
-        trainable=self.trainable)
+      name='embedding_weights',
+      shape=self._embedding_shape,
+      dtype=dtypes.float32,
+      initializer=self._initializer,
+      trainable=self.trainable)
     if self._weight_collections and not context.executing_eagerly():
       _add_to_collections(self._embedding_weight_var, self._weight_collections)
     self.built = True
@@ -1891,21 +1876,21 @@ def _create_weighted_sum(column,
   """Creates a weighted sum for a dense/categorical column for linear_model."""
   if isinstance(column, _CategoricalColumn):
     return _create_categorical_column_weighted_sum(
-        column=column,
-        builder=builder,
-        units=units,
-        sparse_combiner=sparse_combiner,
-        weight_collections=weight_collections,
-        trainable=trainable,
-        weight_var=weight_var)
+      column=column,
+      builder=builder,
+      units=units,
+      sparse_combiner=sparse_combiner,
+      weight_collections=weight_collections,
+      trainable=trainable,
+      weight_var=weight_var)
   else:
     return _create_dense_column_weighted_sum(
-        column=column,
-        builder=builder,
-        units=units,
-        weight_collections=weight_collections,
-        trainable=trainable,
-        weight_var=weight_var)
+      column=column,
+      builder=builder,
+      units=units,
+      weight_collections=weight_collections,
+      trainable=trainable,
+      weight_var=weight_var)
 
 
 def _create_dense_column_weighted_sum(column,
@@ -1916,9 +1901,9 @@ def _create_dense_column_weighted_sum(column,
                                       weight_var=None):
   """Create a weighted sum of a dense column for linear_model."""
   tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-      builder,
-      weight_collections=weight_collections,
-      trainable=trainable)
+    builder,
+    weight_collections=weight_collections,
+    trainable=trainable)
   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
@@ -1926,11 +1911,11 @@ def _create_dense_column_weighted_sum(column,
     weight = weight_var
   else:
     weight = variable_scope.get_variable(
-        name='weights',
-        shape=[num_elements, units],
-        initializer=init_ops.zeros_initializer(),
-        trainable=trainable,
-        collections=weight_collections)
+      name='weights',
+      shape=[num_elements, units],
+      initializer=init_ops.zeros_initializer(),
+      trainable=trainable,
+      collections=weight_collections)
   return math_ops.matmul(tensor, weight, name='weighted_sum')
 
 
@@ -1944,7 +1929,7 @@ class _CategoricalColumn(_FeatureColumn):
   """
 
   IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
-      'IdWeightPair', ['id_tensor', 'weight_tensor'])
+    'IdWeightPair', ['id_tensor', 'weight_tensor'])
 
   @abc.abstractproperty
   def _num_buckets(self):
@@ -2014,39 +1999,39 @@ def _create_categorical_column_weighted_sum(column,
   sparse_combiner = "sum".
   """
   sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
-      builder,
-      weight_collections=weight_collections,
-      trainable=trainable)
+    builder,
+    weight_collections=weight_collections,
+    trainable=trainable)
   id_tensor = sparse_ops.sparse_reshape(
-      sparse_tensors.id_tensor,
-      [array_ops.shape(sparse_tensors.id_tensor)[0], -1])
+    sparse_tensors.id_tensor,
+    [array_ops.shape(sparse_tensors.id_tensor)[0], -1])
   weight_tensor = sparse_tensors.weight_tensor
   if weight_tensor is not None:
     weight_tensor = sparse_ops.sparse_reshape(
-        weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
+      weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
   if weight_var is not None:
     weight = weight_var
   else:
     weight = variable_scope.get_variable(
-        name='weights',
-        shape=(column._num_buckets, units),  # pylint: disable=protected-access
-        initializer=init_ops.zeros_initializer(),
-        trainable=trainable,
-        collections=weight_collections)
+      name='weights',
+      shape=(column._num_buckets, units),  # pylint: disable=protected-access
+      initializer=init_ops.zeros_initializer(),
+      trainable=trainable,
+      collections=weight_collections)
   return embedding_ops.safe_embedding_lookup_sparse(
-      weight,
-      id_tensor,
-      sparse_weights=weight_tensor,
-      combiner=sparse_combiner,
-      name='weighted_sum')
+    weight,
+    id_tensor,
+    sparse_weights=weight_tensor,
+    combiner=sparse_combiner,
+    name='weighted_sum')
 
 
 class _SequenceDenseColumn(_FeatureColumn):
   """Represents dense sequence data."""
 
   TensorSequenceLengthPair = collections.namedtuple(  # pylint: disable=invalid-name
-      'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
+    'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
 
   @abc.abstractmethod
   def _get_sequence_dense_tensor(self,
@@ -2162,7 +2147,7 @@ def _get_raw_feature_as_tensor(self, key):
     """
     raw_feature = self._features[key]
     feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
-        raw_feature)
+      raw_feature)
 
     def expand_dims(input_tensor):
       # Input_tensor must have rank 1.
@@ -2176,20 +2161,20 @@ def expand_dims(input_tensor):
     if rank is not None:
       if rank == 0:
         raise ValueError(
-            'Feature (key: {}) cannot have rank 0. Give: {}'.format(
-                key, feature_tensor))
+          'Feature (key: {}) cannot have rank 0. Give: {}'.format(
+            key, feature_tensor))
       return feature_tensor if rank != 1 else expand_dims(feature_tensor)
 
     # Handle dynamic rank.
     with ops.control_dependencies([
-        check_ops.assert_positive(
-            array_ops.rank(feature_tensor),
-            message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
-                key, feature_tensor))
+      check_ops.assert_positive(
+        array_ops.rank(feature_tensor),
+        message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
+          key, feature_tensor))
     ]):
       return control_flow_ops.cond(
-          math_ops.equal(1, array_ops.rank(feature_tensor)),
-          lambda: expand_dims(feature_tensor), lambda: feature_tensor)
+        math_ops.equal(1, array_ops.rank(feature_tensor)),
+        lambda: expand_dims(feature_tensor), lambda: feature_tensor)
 
 
 # TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
@@ -2224,7 +2209,7 @@ def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
     ValueError: when `input_tensor`'s rank is `None`.
   """
   input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
-      input_tensor)
+    input_tensor)
   if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
     return input_tensor
   with ops.name_scope(None, 'to_sparse_input', (
@@ -2243,14 +2228,14 @@ def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
         # default value for that type.
         ignore_value = input_tensor.dtype.as_numpy_dtype()
     ignore_value = math_ops.cast(
-        ignore_value, input_tensor.dtype, name='ignore_value')
+      ignore_value, input_tensor.dtype, name='ignore_value')
     indices = array_ops.where(
-        math_ops.not_equal(input_tensor, ignore_value), name='indices')
+      math_ops.not_equal(input_tensor, ignore_value), name='indices')
     return sparse_tensor_lib.SparseTensor(
-        indices=indices,
-        values=array_ops.gather_nd(input_tensor, indices, name='values'),
-        dense_shape=array_ops.shape(
-            input_tensor, out_type=dtypes.int64, name='dense_shape'))
+      indices=indices,
+      values=array_ops.gather_nd(input_tensor, indices, name='values'),
+      dense_shape=array_ops.shape(
+        input_tensor, out_type=dtypes.int64, name='dense_shape'))
 
 
 def _normalize_feature_columns(feature_columns):
@@ -2299,10 +2284,10 @@ def _normalize_feature_columns(feature_columns):
 
 
 class _NumericColumn(
-    _DenseColumn,
-    collections.namedtuple(
-        '_NumericColumn',
-        ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])):
+  _DenseColumn,
+  collections.namedtuple(
+    '_NumericColumn',
+    ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])):
   """see `numeric_column`."""
 
   @property
@@ -2312,17 +2297,17 @@ def name(self):
   @property
   def _parse_example_spec(self):
     return {
-        self.key:
-            parsing_ops.FixedLenFeature(self.shape, self.dtype,
-                                        self.default_value)
+      self.key:
+        parsing_ops.FixedLenFeature(self.shape, self.dtype,
+                                    self.default_value)
     }
 
   def _transform_feature(self, inputs):
     input_tensor = inputs.get(self.key)
     if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
       raise ValueError(
-          'The corresponding Tensor of numerical column must be a Tensor. '
-          'SparseTensor is not supported. key: {}'.format(self.key))
+        'The corresponding Tensor of numerical column must be a Tensor. '
+        'SparseTensor is not supported. key: {}'.format(self.key))
     if self.normalizer_fn is not None:
       input_tensor = self.normalizer_fn(input_tensor)
     return math_ops.cast(input_tensor, dtypes.float32)
@@ -2374,23 +2359,23 @@ def _parse_example_spec(self):
   def _transform_feature(self, inputs):
     source_tensor = inputs.get(self.source_column)
     return math_ops._bucketize(  # pylint: disable=protected-access
-        source_tensor,
-        boundaries=self.boundaries)
+      source_tensor,
+      boundaries=self.boundaries)
 
   @property
   def _variable_shape(self):
     return tensor_shape.TensorShape(
-        tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
+      tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     del weight_collections
     del trainable
     input_tensor = inputs.get(self)
     return array_ops.one_hot(
-        indices=math_ops.cast(input_tensor, dtypes.int64),
-        depth=len(self.boundaries) + 1,
-        on_value=1.,
-        off_value=0.)
+      indices=math_ops.cast(input_tensor, dtypes.int64),
+      depth=len(self.boundaries) + 1,
+      on_value=1.,
+      off_value=0.)
 
   @property
   def _num_buckets(self):
@@ -2408,9 +2393,9 @@ def _get_sparse_tensors(self,
     source_dimension = self.source_column.shape[0]
 
     i1 = array_ops.reshape(
-        array_ops.tile(
-            array_ops.expand_dims(math_ops.range(0, batch_size), 1),
-            [1, source_dimension]), (-1,))
+      array_ops.tile(
+        array_ops.expand_dims(math_ops.range(0, batch_size), 1),
+        [1, source_dimension]), (-1,))
     i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
     # Flatten the bucket indices and unique them across dimensions
     # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
@@ -2419,20 +2404,20 @@ def _get_sparse_tensors(self,
                           (-1,)) + (len(self.boundaries) + 1) * i2)
 
     indices = math_ops.cast(
-        array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
+      array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
     dense_shape = math_ops.cast(
-        array_ops.stack([batch_size, source_dimension]), dtypes.int64)
+      array_ops.stack([batch_size, source_dimension]), dtypes.int64)
     sparse_tensor = sparse_tensor_lib.SparseTensor(
-        indices=indices, values=bucket_indices, dense_shape=dense_shape)
+      indices=indices, values=bucket_indices, dense_shape=dense_shape)
     return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
 
 
 class _EmbeddingColumn(
-    _DenseColumn, _SequenceDenseColumn,
-    collections.namedtuple(
-        '_EmbeddingColumn',
-        ('categorical_column', 'dimension', 'combiner', 'layer_creator',
-         'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
+  _DenseColumn, _SequenceDenseColumn,
+  collections.namedtuple(
+    '_EmbeddingColumn',
+    ('categorical_column', 'dimension', 'combiner', 'layer_creator',
+     'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
   """See `embedding_column`."""
 
   @property
@@ -2461,47 +2446,47 @@ def _get_dense_tensor_internal(self,
     """Private method that follows the signature of _get_dense_tensor."""
     # Get sparse IDs and weights.
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
-        inputs,
-        weight_collections=weight_collections,
-        trainable=trainable)
+      inputs,
+      weight_collections=weight_collections,
+      trainable=trainable)
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
     embedding_weights = self.layer_creator(
-        weight_collections=weight_collections,
-        scope=variable_scope.get_variable_scope())
+      weight_collections=weight_collections,
+      scope=variable_scope.get_variable_scope())
 
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
       if isinstance(to_restore, variables.PartitionedVariable):
         to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
       checkpoint_utils.init_from_checkpoint(
-          self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
+        self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
 
     # Return embedding lookup result.
     return embedding_ops.safe_embedding_lookup_sparse(
-        embedding_weights=embedding_weights,
-        sparse_ids=sparse_ids,
-        sparse_weights=sparse_weights,
-        combiner=self.combiner,
-        name='%s_weights' % self.name,
-        max_norm=self.max_norm)
+      embedding_weights=embedding_weights,
+      sparse_ids=sparse_ids,
+      sparse_weights=sparse_weights,
+      combiner=self.combiner,
+      name='%s_weights' % self.name,
+      max_norm=self.max_norm)
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-          'In embedding_column: {}. '
-          'categorical_column must not be of type _SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
-          'non-sequence categorical_column_with_*. '
-          'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
-          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                       self.categorical_column))
+        'In embedding_column: {}. '
+        'categorical_column must not be of type _SequenceCategoricalColumn. '
+        'Suggested fix A: If you wish to use input_layer, use a '
+        'non-sequence categorical_column_with_*. '
+        'Suggested fix B: If you wish to create sequence input, use '
+        'sequence_input_layer instead of input_layer. '
+        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                     self.categorical_column))
     return self._get_dense_tensor_internal(
-        inputs=inputs,
-        weight_collections=weight_collections,
-        trainable=trainable)
+      inputs=inputs,
+      weight_collections=weight_collections,
+      trainable=trainable)
 
   def _get_sequence_dense_tensor(self,
                                  inputs,
@@ -2509,22 +2494,22 @@ def _get_sequence_dense_tensor(self,
                                  trainable=None):
     if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-          'In embedding_column: {}. '
-          'categorical_column must be of type _SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
-          'Suggested fix: Use one of sequence_categorical_column_with_*. '
-          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                       self.categorical_column))
+        'In embedding_column: {}. '
+        'categorical_column must be of type _SequenceCategoricalColumn '
+        'to use sequence_input_layer. '
+        'Suggested fix: Use one of sequence_categorical_column_with_*. '
+        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                     self.categorical_column))
     dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
-        inputs=inputs,
-        weight_collections=weight_collections,
-        trainable=trainable)
+      inputs=inputs,
+      weight_collections=weight_collections,
+      trainable=trainable)
 
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     sequence_length = fc_utils.sequence_length_from_sparse_tensor(
-        sparse_tensors.id_tensor)
+      sparse_tensors.id_tensor)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
-        dense_tensor=dense_tensor, sequence_length=sequence_length)
+      dense_tensor=dense_tensor, sequence_length=sequence_length)
 
 
 def _get_graph_for_variable(var):
@@ -2535,13 +2520,13 @@ def _get_graph_for_variable(var):
 
 
 class _SharedEmbeddingColumn(
-    _DenseColumn, _SequenceDenseColumn,
-    collections.namedtuple(
-        '_SharedEmbeddingColumn',
-        ('categorical_column', 'dimension', 'combiner', 'initializer',
-         'shared_embedding_collection_name', 'ckpt_to_load_from',
-         'tensor_name_in_ckpt', 'max_norm', 'trainable', 'partitioner',
-         'ev_params'))):
+  _DenseColumn, _SequenceDenseColumn,
+  collections.namedtuple(
+    '_SharedEmbeddingColumn',
+    ('categorical_column', 'dimension', 'combiner', 'initializer',
+     'shared_embedding_collection_name', 'ckpt_to_load_from',
+     'tensor_name_in_ckpt', 'max_norm', 'trainable', 'partitioner',
+     'ev_params'))):
   """See `embedding_column`."""
 
   @property
@@ -2556,9 +2541,9 @@ def raw_name(self):
 
   @property
   def cardinality(self):
-    from easy_rec.python.compat.feature_column.feature_column_v2 import HashedCategoricalColumn,\
+    from easy_rec.python.compat.feature_column.feature_column_v2 import HashedCategoricalColumn, \
       BucketizedColumn, WeightedCategoricalColumn, SequenceWeightedCategoricalColumn, \
-      CrossedColumn, IdentityCategoricalColumn, VocabularyListCategoricalColumn,\
+      CrossedColumn, IdentityCategoricalColumn, VocabularyListCategoricalColumn, \
       VocabularyFileCategoricalColumn
 
     fc = self.categorical_column
@@ -2621,66 +2606,66 @@ def _get_dense_tensor_internal(self,
     with ops.name_scope(None, default_name=self.name):
       # Get sparse IDs and weights.
       sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
-          inputs,
-          weight_collections=weight_collections,
-          trainable=trainable)
+        inputs,
+        weight_collections=weight_collections,
+        trainable=trainable)
       sparse_ids = sparse_tensors.id_tensor
       sparse_weights = sparse_tensors.weight_tensor
 
       embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
       shared_embedding_collection = ops.get_collection(
-          self.shared_embedding_collection_name)
+        self.shared_embedding_collection_name)
       if shared_embedding_collection:
         if len(shared_embedding_collection) > 1:
           raise ValueError(
-              'Collection {} can only contain one variable. '
-              'Suggested fix A: Choose a unique name for this collection. '
-              'Suggested fix B: Do not add any variables to this collection. '
-              'The feature_column library already adds a variable under the '
-              'hood.'.format(shared_embedding_collection))
+            'Collection {} can only contain one variable. '
+            'Suggested fix A: Choose a unique name for this collection. '
+            'Suggested fix B: Do not add any variables to this collection. '
+            'The feature_column library already adds a variable under the '
+            'hood.'.format(shared_embedding_collection))
         embedding_weights = shared_embedding_collection[0]
         if embedding_weights.get_shape(
         ) != embedding_shape and not self.ev_params is not None:  # noqa : E714
           raise ValueError(
-              'Shared embedding collection {} contains variable {} of '
-              'unexpected shape {}. Expected shape is {}. '
-              'Suggested fix A: Choose a unique name for this collection. '
-              'Suggested fix B: Do not add any variables to this collection. '
-              'The feature_column library already adds a variable under the '
-              'hood.'.format(self.shared_embedding_collection_name,
-                             embedding_weights.name,
-                             embedding_weights.get_shape(), embedding_shape))
+            'Shared embedding collection {} contains variable {} of '
+            'unexpected shape {}. Expected shape is {}. '
+            'Suggested fix A: Choose a unique name for this collection. '
+            'Suggested fix B: Do not add any variables to this collection. '
+            'The feature_column library already adds a variable under the '
+            'hood.'.format(self.shared_embedding_collection_name,
+                           embedding_weights.name,
+                           embedding_weights.get_shape(), embedding_shape))
       else:
         if self.ev_params is None:
           embedding_weights = variable_scope.get_variable(
-              name='embedding_weights',
-              shape=embedding_shape,
-              dtype=dtypes.float32,
-              initializer=self.initializer,
-              trainable=self.trainable and trainable,
-              partitioner=self.partitioner,
-              collections=weight_collections)
+            name='embedding_weights',
+            shape=embedding_shape,
+            dtype=dtypes.float32,
+            initializer=self.initializer,
+            trainable=self.trainable and trainable,
+            partitioner=self.partitioner,
+            collections=weight_collections)
         else:
           # at eval or inference time, it is necessary to set
           # the initializers to zeros, so that new key will
           # get zero embedding
           import os
           if os.environ.get('tf.estimator.mode', '') != \
-             os.environ.get('tf.estimator.ModeKeys.TRAIN', 'train'):
+              os.environ.get('tf.estimator.ModeKeys.TRAIN', 'train'):
             initializer = init_ops.zeros_initializer()
           else:
             initializer = self.initializer
           embedding_weights = variable_scope.get_embedding_variable(
-              name='embedding_weights',
-              embedding_dim=self.dimension,
-              initializer=initializer,
-              trainable=self.trainable and trainable,
-              partitioner=self.partitioner,
-              collections=weight_collections,
-              steps_to_live=self.ev_params.steps_to_live
-              if self.ev_params is not None else None,
-              filter_options=variables.CounterFilterOptions(
-                  self.ev_params.filter_freq))
+            name='embedding_weights',
+            embedding_dim=self.dimension,
+            initializer=initializer,
+            trainable=self.trainable and trainable,
+            partitioner=self.partitioner,
+            collections=weight_collections,
+            steps_to_live=self.ev_params.steps_to_live
+            if self.ev_params is not None else None,
+            filter_options=variables.CounterFilterOptions(
+              self.ev_params.filter_freq))
 
         ops.add_to_collection(self.shared_embedding_collection_name,
                               embedding_weights)
@@ -2689,41 +2674,41 @@ def _get_dense_tensor_internal(self,
         if isinstance(to_restore, variables.PartitionedVariable):
           to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
         checkpoint_utils.init_from_checkpoint(
-            self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
+          self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
 
       # Return embedding lookup result.
       if self.ev_params is not None:
         return ev_embedding_ops.safe_embedding_lookup_sparse(
-            embedding_weights=embedding_weights,
-            sparse_ids=sparse_ids,
-            sparse_weights=sparse_weights,
-            combiner=self.combiner,
-            name='%s_weights' % self.name,
-            max_norm=self.max_norm)
+          embedding_weights=embedding_weights,
+          sparse_ids=sparse_ids,
+          sparse_weights=sparse_weights,
+          combiner=self.combiner,
+          name='%s_weights' % self.name,
+          max_norm=self.max_norm)
       else:
         return embedding_ops.safe_embedding_lookup_sparse(
-            embedding_weights=embedding_weights,
-            sparse_ids=sparse_ids,
-            sparse_weights=sparse_weights,
-            combiner=self.combiner,
-            name='%s_weights' % self.name,
-            max_norm=self.max_norm)
+          embedding_weights=embedding_weights,
+          sparse_ids=sparse_ids,
+          sparse_weights=sparse_weights,
+          combiner=self.combiner,
+          name='%s_weights' % self.name,
+          max_norm=self.max_norm)
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-          'In embedding_column: {}. '
-          'categorical_column must not be of type _SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
-          'non-sequence categorical_column_with_*. '
-          'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
-          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                       self.categorical_column))
+        'In embedding_column: {}. '
+        'categorical_column must not be of type _SequenceCategoricalColumn. '
+        'Suggested fix A: If you wish to use input_layer, use a '
+        'non-sequence categorical_column_with_*. '
+        'Suggested fix B: If you wish to create sequence input, use '
+        'sequence_input_layer instead of input_layer. '
+        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                     self.categorical_column))
     return self._get_dense_tensor_internal(
-        inputs=inputs,
-        weight_collections=weight_collections,
-        trainable=trainable)
+      inputs=inputs,
+      weight_collections=weight_collections,
+      trainable=trainable)
 
   def _get_sequence_dense_tensor(self,
                                  inputs,
@@ -2731,21 +2716,21 @@ def _get_sequence_dense_tensor(self,
                                  trainable=None):
     if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-          'In embedding_column: {}. '
-          'categorical_column must be of type _SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
-          'Suggested fix: Use one of sequence_categorical_column_with_*. '
-          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                       self.categorical_column))
+        'In embedding_column: {}. '
+        'categorical_column must be of type _SequenceCategoricalColumn '
+        'to use sequence_input_layer. '
+        'Suggested fix: Use one of sequence_categorical_column_with_*. '
+        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                     self.categorical_column))
     dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
-        inputs=inputs,
-        weight_collections=weight_collections,
-        trainable=trainable)
+      inputs=inputs,
+      weight_collections=weight_collections,
+      trainable=trainable)
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     sequence_length = fc_utils.sequence_length_from_sparse_tensor(
-        sparse_tensors.id_tensor)
+      sparse_tensors.id_tensor)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
-        dense_tensor=dense_tensor, sequence_length=sequence_length)
+      dense_tensor=dense_tensor, sequence_length=sequence_length)
 
 
 def _check_shape(shape, key):
@@ -2766,8 +2751,8 @@ def _check_shape(shape, key):
 
 class _HashedCategoricalColumn(_CategoricalColumn,
                                collections.namedtuple(
-                                   '_HashedCategoricalColumn',
-                                   ['key', 'hash_bucket_size', 'dtype'])):
+                                 '_HashedCategoricalColumn',
+                                 ['key', 'hash_bucket_size', 'dtype'])):
   """see `categorical_column_with_hash_bucket`."""
 
   @property
@@ -2788,14 +2773,14 @@ def _transform_feature(self, inputs):
       raise ValueError('SparseColumn input must be a SparseTensor.')
 
     fc_utils.assert_string_or_int(
-        input_tensor.dtype,
-        prefix='column_name: {} input_tensor'.format(self.key))
+      input_tensor.dtype,
+      prefix='column_name: {} input_tensor'.format(self.key))
 
     if self.dtype.is_integer != input_tensor.dtype.is_integer:
       raise ValueError(
-          'Column dtype and SparseTensors dtype must be compatible. '
-          'key: {}, column dtype: {}, tensor dtype: {}'.format(
-              self.key, self.dtype, input_tensor.dtype))
+        'Column dtype and SparseTensors dtype must be compatible. '
+        'key: {}, column dtype: {}, tensor dtype: {}'.format(
+          self.key, self.dtype, input_tensor.dtype))
 
     if self.dtype == dtypes.string:
       sparse_values = input_tensor.values
@@ -2803,7 +2788,7 @@ def _transform_feature(self, inputs):
       sparse_values = string_ops.as_string(input_tensor.values)
 
     sparse_id_values = string_ops.string_to_hash_bucket_fast(
-        sparse_values, self.hash_bucket_size, name='lookup')
+      sparse_values, self.hash_bucket_size, name='lookup')
     return sparse_tensor_lib.SparseTensor(input_tensor.indices,
                                           sparse_id_values,
                                           input_tensor.dense_shape)
@@ -2821,10 +2806,10 @@ def _get_sparse_tensors(self,
 
 
 class _VocabularyFileCategoricalColumn(
-    _CategoricalColumn,
-    collections.namedtuple('_VocabularyFileCategoricalColumn',
-                           ('key', 'vocabulary_file', 'vocabulary_size',
-                            'num_oov_buckets', 'dtype', 'default_value'))):
+  _CategoricalColumn,
+  collections.namedtuple('_VocabularyFileCategoricalColumn',
+                         ('key', 'vocabulary_file', 'vocabulary_size',
+                          'num_oov_buckets', 'dtype', 'default_value'))):
   """See `categorical_column_with_vocabulary_file`."""
 
   @property
@@ -2840,13 +2825,13 @@ def _transform_feature(self, inputs):
 
     if self.dtype.is_integer != input_tensor.dtype.is_integer:
       raise ValueError(
-          'Column dtype and SparseTensors dtype must be compatible. '
-          'key: {}, column dtype: {}, tensor dtype: {}'.format(
-              self.key, self.dtype, input_tensor.dtype))
+        'Column dtype and SparseTensors dtype must be compatible. '
+        'key: {}, column dtype: {}, tensor dtype: {}'.format(
+          self.key, self.dtype, input_tensor.dtype))
 
     fc_utils.assert_string_or_int(
-        input_tensor.dtype,
-        prefix='column_name: {} input_tensor'.format(self.key))
+      input_tensor.dtype,
+      prefix='column_name: {} input_tensor'.format(self.key))
 
     key_dtype = self.dtype
     if input_tensor.dtype.is_integer:
@@ -2855,12 +2840,12 @@ def _transform_feature(self, inputs):
       input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     return lookup_ops.index_table_from_file(
-        vocabulary_file=self.vocabulary_file,
-        num_oov_buckets=self.num_oov_buckets,
-        vocab_size=self.vocabulary_size,
-        default_value=self.default_value,
-        key_dtype=key_dtype,
-        name='{}_lookup'.format(self.key)).lookup(input_tensor)
+      vocabulary_file=self.vocabulary_file,
+      num_oov_buckets=self.num_oov_buckets,
+      vocab_size=self.vocabulary_size,
+      default_value=self.default_value,
+      key_dtype=key_dtype,
+      name='{}_lookup'.format(self.key)).lookup(input_tensor)
 
   @property
   def _num_buckets(self):
@@ -2875,10 +2860,10 @@ def _get_sparse_tensors(self,
 
 
 class _VocabularyListCategoricalColumn(
-    _CategoricalColumn,
-    collections.namedtuple(
-        '_VocabularyListCategoricalColumn',
-        ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
+  _CategoricalColumn,
+  collections.namedtuple(
+    '_VocabularyListCategoricalColumn',
+    ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
 ):
   """See `categorical_column_with_vocabulary_list`."""
 
@@ -2895,13 +2880,13 @@ def _transform_feature(self, inputs):
 
     if self.dtype.is_integer != input_tensor.dtype.is_integer:
       raise ValueError(
-          'Column dtype and SparseTensors dtype must be compatible. '
-          'key: {}, column dtype: {}, tensor dtype: {}'.format(
-              self.key, self.dtype, input_tensor.dtype))
+        'Column dtype and SparseTensors dtype must be compatible. '
+        'key: {}, column dtype: {}, tensor dtype: {}'.format(
+          self.key, self.dtype, input_tensor.dtype))
 
     fc_utils.assert_string_or_int(
-        input_tensor.dtype,
-        prefix='column_name: {} input_tensor'.format(self.key))
+      input_tensor.dtype,
+      prefix='column_name: {} input_tensor'.format(self.key))
 
     key_dtype = self.dtype
     if input_tensor.dtype.is_integer:
@@ -2910,11 +2895,11 @@ def _transform_feature(self, inputs):
       input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     return lookup_ops.index_table_from_tensor(
-        vocabulary_list=tuple(self.vocabulary_list),
-        default_value=self.default_value,
-        num_oov_buckets=self.num_oov_buckets,
-        dtype=key_dtype,
-        name='{}_lookup'.format(self.key)).lookup(input_tensor)
+      vocabulary_list=tuple(self.vocabulary_list),
+      default_value=self.default_value,
+      num_oov_buckets=self.num_oov_buckets,
+      dtype=key_dtype,
+      name='{}_lookup'.format(self.key)).lookup(input_tensor)
 
   @property
   def _num_buckets(self):
@@ -2930,8 +2915,8 @@ def _get_sparse_tensors(self,
 
 class _IdentityCategoricalColumn(_CategoricalColumn,
                                  collections.namedtuple(
-                                     '_IdentityCategoricalColumn',
-                                     ('key', 'num_buckets', 'default_value'))):
+                                   '_IdentityCategoricalColumn',
+                                   ('key', 'num_buckets', 'default_value'))):
   """See `categorical_column_with_identity`."""
 
   @property
@@ -2947,37 +2932,37 @@ def _transform_feature(self, inputs):
 
     if not input_tensor.dtype.is_integer:
       raise ValueError('Invalid input, not integer. key: {} dtype: {}'.format(
-          self.key, input_tensor.dtype))
+        self.key, input_tensor.dtype))
 
     values = math_ops.cast(input_tensor.values, dtypes.int64, name='values')
     num_buckets = math_ops.cast(
-        self.num_buckets, dtypes.int64, name='num_buckets')
+      self.num_buckets, dtypes.int64, name='num_buckets')
     zero = math_ops.cast(0, dtypes.int64, name='zero')
     if self.default_value is None:
       # Fail if values are out-of-range.
       assert_less = check_ops.assert_less(
-          values,
-          num_buckets,
-          data=(values, num_buckets),
-          name='assert_less_than_num_buckets')
+        values,
+        num_buckets,
+        data=(values, num_buckets),
+        name='assert_less_than_num_buckets')
       assert_greater = check_ops.assert_greater_equal(
-          values, zero, data=(values,), name='assert_greater_or_equal_0')
+        values, zero, data=(values,), name='assert_greater_or_equal_0')
       with ops.control_dependencies((assert_less, assert_greater)):
         values = array_ops.identity(values)
     else:
       # Assign default for out-of-range values.
       values = array_ops.where(
-          math_ops.logical_or(
-              values < zero, values >= num_buckets, name='out_of_range'),
-          array_ops.fill(
-              dims=array_ops.shape(values),
-              value=math_ops.cast(self.default_value, dtypes.int64),
-              name='default_values'), values)
+        math_ops.logical_or(
+          values < zero, values >= num_buckets, name='out_of_range'),
+        array_ops.fill(
+          dims=array_ops.shape(values),
+          value=math_ops.cast(self.default_value, dtypes.int64),
+          name='default_values'), values)
 
     return sparse_tensor_lib.SparseTensor(
-        indices=input_tensor.indices,
-        values=values,
-        dense_shape=input_tensor.dense_shape)
+      indices=input_tensor.indices,
+      values=values,
+      dense_shape=input_tensor.dense_shape)
 
   @property
   def _num_buckets(self):
@@ -2992,10 +2977,10 @@ def _get_sparse_tensors(self,
 
 
 class _WeightedCategoricalColumn(
-    _CategoricalColumn,
-    collections.namedtuple(
-        '_WeightedCategoricalColumn',
-        ('categorical_column', 'weight_feature_key', 'dtype'))):
+  _CategoricalColumn,
+  collections.namedtuple(
+    '_WeightedCategoricalColumn',
+    ('categorical_column', 'weight_feature_key', 'dtype'))):
   """See `weighted_categorical_column`."""
 
   @property
@@ -3008,7 +2993,7 @@ def _parse_example_spec(self):
     config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
     if self.weight_feature_key in config:
       raise ValueError('Parse config {} already exists for {}.'.format(
-          config[self.weight_feature_key], self.weight_feature_key))
+        config[self.weight_feature_key], self.weight_feature_key))
     config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
     return config
 
@@ -3021,14 +3006,14 @@ def _transform_feature(self, inputs):
     if weight_tensor is None:
       raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
     weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
-        weight_tensor)
+      weight_tensor)
     if self.dtype != weight_tensor.dtype.base_dtype:
       raise ValueError('Bad dtype, expected {}, but got {}.'.format(
-          self.dtype, weight_tensor.dtype))
+        self.dtype, weight_tensor.dtype))
     if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
       # The weight tensor can be a regular Tensor. In this case, sparsify it.
       weight_tensor = _to_sparse_input_and_drop_ignore_values(
-          weight_tensor, ignore_value=0.0)
+        weight_tensor, ignore_value=0.0)
     if not weight_tensor.dtype.is_floating:
       weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
     return (inputs.get(self.categorical_column), weight_tensor)
@@ -3044,9 +3029,9 @@ def _get_sparse_tensors(self,
 
 
 class _CrossedColumn(
-    _CategoricalColumn,
-    collections.namedtuple('_CrossedColumn',
-                           ['keys', 'hash_bucket_size', 'hash_key'])):
+  _CategoricalColumn,
+  collections.namedtuple('_CrossedColumn',
+                         ['keys', 'hash_bucket_size', 'hash_key'])):
   """See `crossed_column`."""
 
   @property
@@ -3078,16 +3063,16 @@ def _transform_feature(self, inputs):
         ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
         if ids_and_weights.weight_tensor is not None:
           raise ValueError(
-              'crossed_column does not support weight_tensor, but the given '
-              'column populates weight_tensor. '
-              'Given column: {}'.format(key.name))
+            'crossed_column does not support weight_tensor, but the given '
+            'column populates weight_tensor. '
+            'Given column: {}'.format(key.name))
         feature_tensors.append(ids_and_weights.id_tensor)
       else:
         raise ValueError('Unsupported column type. Given: {}'.format(key))
     return sparse_ops.sparse_cross_hashed(
-        inputs=feature_tensors,
-        num_buckets=self.hash_bucket_size,
-        hash_key=self.hash_key)
+      inputs=feature_tensors,
+      num_buckets=self.hash_bucket_size,
+      hash_key=self.hash_key)
 
   @property
   def _num_buckets(self):
@@ -3152,9 +3137,9 @@ def _transform_feature(self, inputs):
     # If the underlying column is weighted, return the input as a dense tensor.
     if weight_tensor is not None:
       weighted_column = sparse_ops.sparse_merge(
-          sp_ids=id_tensor,
-          sp_values=weight_tensor,
-          vocab_size=int(self._variable_shape[-1]))
+        sp_ids=id_tensor,
+        sp_values=weight_tensor,
+        vocab_size=int(self._variable_shape[-1]))
       # Remove (?, -1) index.
       weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                 weighted_column.dense_shape)
@@ -3165,15 +3150,15 @@ def _transform_feature(self, inputs):
                                   weighted_column.dense_shape)
 
     dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
-        id_tensor, default_value=-1)
+      id_tensor, default_value=-1)
 
     # One hot must be float for tf.concat reasons since all other inputs to
     # input_layer are float32.
     one_hot_id_tensor = array_ops.one_hot(
-        dense_id_tensor,
-        depth=self._variable_shape[-1],
-        on_value=1.0,
-        off_value=0.0)
+      dense_id_tensor,
+      depth=self._variable_shape[-1],
+      on_value=1.0,
+      off_value=0.0)
 
     # Reduce to get a multi-hot per example.
     return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])
@@ -3209,14 +3194,14 @@ def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     del trainable
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-          'In indicator_column: {}. '
-          'categorical_column must not be of type _SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
-          'non-sequence categorical_column_with_*. '
-          'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
-          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                       self.categorical_column))
+        'In indicator_column: {}. '
+        'categorical_column must not be of type _SequenceCategoricalColumn. '
+        'Suggested fix A: If you wish to use input_layer, use a '
+        'non-sequence categorical_column_with_*. '
+        'Suggested fix B: If you wish to create sequence input, use '
+        'sequence_input_layer instead of input_layer. '
+        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                     self.categorical_column))
     # Feature has been already transformed. Return the intermediate
     # representation created by _transform_feature.
     return inputs.get(self)
@@ -3231,20 +3216,20 @@ def _get_sequence_dense_tensor(self,
     del trainable
     if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-          'In indicator_column: {}. '
-          'categorical_column must be of type _SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
-          'Suggested fix: Use one of sequence_categorical_column_with_*. '
-          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                       self.categorical_column))
+        'In indicator_column: {}. '
+        'categorical_column must be of type _SequenceCategoricalColumn '
+        'to use sequence_input_layer. '
+        'Suggested fix: Use one of sequence_categorical_column_with_*. '
+        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                     self.categorical_column))
     # Feature has been already transformed. Return the intermediate
     # representation created by _transform_feature.
     dense_tensor = inputs.get(self)
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     sequence_length = fc_utils.sequence_length_from_sparse_tensor(
-        sparse_tensors.id_tensor)
+      sparse_tensors.id_tensor)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
-        dense_tensor=dense_tensor, sequence_length=sequence_length)
+      dense_tensor=dense_tensor, sequence_length=sequence_length)
 
 
 def _verify_static_batch_size_equality(tensors, columns):
@@ -3267,16 +3252,16 @@ def _verify_static_batch_size_equality(tensors, columns):
         expected_batch_size = tensors[i].shape.dims[0]
       elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
         raise ValueError(
-            'Batch size (first dimension) of each feature must be same. '
-            'Batch size of columns ({}, {}): ({}, {})'.format(
-                columns[bath_size_column_index].name, columns[i].name,
-                expected_batch_size, tensors[i].shape.dims[0]))
+          'Batch size (first dimension) of each feature must be same. '
+          'Batch size of columns ({}, {}): ({}, {})'.format(
+            columns[bath_size_column_index].name, columns[i].name,
+            expected_batch_size, tensors[i].shape.dims[0]))
 
 
 class _SequenceCategoricalColumn(_CategoricalColumn,
                                  collections.namedtuple(
-                                     '_SequenceCategoricalColumn',
-                                     ['categorical_column'])):
+                                   '_SequenceCategoricalColumn',
+                                   ['categorical_column'])):
   """Represents sequences of categorical data."""
 
   @property
diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py
index c264c30c2..0ca532bea 100644
--- a/easy_rec/python/compat/feature_column/feature_column_v2.py
+++ b/easy_rec/python/compat/feature_column/feature_column_v2.py
@@ -5451,3 +5451,13 @@ def deserialize_feature_columns(configs, custom_objects=None):
       deserialize_feature_column(c, custom_objects, columns_by_name)
       for c in configs
   ]
+
+
+def is_embedding_column(fc):
+  if isinstance(fc, EmbeddingColumn):
+    return True
+  if isinstance(fc, fc_old._SharedEmbeddingColumn):
+    return True
+  if isinstance(fc, SharedEmbeddingColumn):
+    return True
+  return False
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index d2325e680..5cdaa1dd1 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -6,6 +6,7 @@
 
 import six
 import tensorflow as tf
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import gfile
 
 from easy_rec.python.core import sampler as sampler_lib
@@ -1012,10 +1013,12 @@ def _input_fn(mode=None, params=None, config=None):
         return dataset
       elif mode is None:  # serving_input_receiver_fn for export SavedModel
         if export_config.multi_placeholder:
-          inputs, features = self.create_multi_placeholders(export_config)
+          with ops.device('/CPU:0'):
+            inputs, features = self.create_multi_placeholders(export_config)
           return tf.estimator.export.ServingInputReceiver(features, inputs)
         else:
-          inputs, features = self.create_placeholders(export_config)
+          with ops.device('/CPU:0'):
+            inputs, features = self.create_placeholders(export_config)
           print('built feature placeholders. features: {}'.format(
               features.keys()))
           return tf.estimator.export.ServingInputReceiver(features, inputs)
diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 8caa31b80..fa604926d 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -4,68 +4,23 @@
 
 import tensorflow as tf
 
-from easy_rec.python.utils.dag import DAG
 from easy_rec.python.layers import dnn
-from easy_rec.python.layers.common_layers import layer_norm, SENet, highway
-from easy_rec.python.layers.numerical_embedding import PeriodicEmbedding, AutoDisEmbedding
+from easy_rec.python.layers.common_layers import SENet, EnhancedInputLayer
+from easy_rec.python.layers.common_layers import highway, Concatenate
 from easy_rec.python.layers.fibinet import FiBiNetLayer
+from easy_rec.python.layers.fm import FM, FMLayer
 from easy_rec.python.layers.mask_net import MaskNet
-from easy_rec.python.layers.fm import FMLayer
+from easy_rec.python.layers.numerical_embedding import AutoDisEmbedding
+from easy_rec.python.layers.numerical_embedding import PeriodicEmbedding
+from easy_rec.python.utils.dag import DAG
+from easy_rec.python.utils.tf_utils import add_op, dot_op
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
 
 
-class EnhancedInputLayer(object):
-  def __init__(self, config, input_layer, feature_dict):
-    if config.do_batch_norm and config.do_layer_norm:
-      raise ValueError('can not do batch norm and layer norm for input layer at the same time')
-    self._config = config
-    self._input_layer = input_layer
-    self._feature_dict = feature_dict
-
-  def __call__(self, feature_group, is_training, *args, **kwargs):
-    features, feature_list = self._input_layer(self._feature_dict, feature_group)
-    num_features = len(feature_list)
-
-    do_feature_dropout = 0.0 < self._config.feature_dropout_rate < 1.0
-    if self._config.output_feature_list or do_feature_dropout:
-      if self._config.do_layer_norm or self._config.do_batch_norm:
-        for i in range(num_features):
-          fea = feature_list[i]
-          if self._config.do_batch_norm:
-            fea = tf.layers.batch_normalization(fea, training=is_training)
-          elif self._config.do_layer_norm:
-            fea = layer_norm(fea)
-          feature_list[i] = fea
-    elif self._config.do_batch_norm:
-      features = tf.layers.batch_normalization(features, training=is_training)
-    elif self._config.do_layer_norm:
-      features = layer_norm(features)
-
-    if do_feature_dropout and is_training:
-      keep_prob = 1.0 - self._config.feature_dropout_rate
-      bern = tf.distributions.Bernoulli(probs=keep_prob)
-      mask = bern.sample(num_features)
-      for i in range(num_features):
-        fea = tf.div(feature_list[i], keep_prob) * mask[i]
-        feature_list[i] = fea
-      features = tf.concat(feature_list, axis=-1)
-
-    do_dropout = 0.0 < self._config.dropout_rate < 1.0
-    if self._config.output_feature_list:
-      if do_dropout:
-        for i in range(num_features):
-          fea = feature_list[i]
-          fea = tf.layers.dropout(fea, self._config.dropout_rate, training=is_training)
-          feature_list[i] = fea
-      return feature_list
-    if do_dropout:
-      return tf.layers.dropout(features, self._config.dropout_rate, training=is_training)
-    return features
-
-
 class Backbone(object):
+
   def __init__(self, config, model, features, input_layer, l2_reg=None):
     self._model = model
     self._config = config
@@ -77,14 +32,15 @@ def __init__(self, config, model, features, input_layer, l2_reg=None):
     for block in config.blocks:
       self._name_to_blocks[block.name] = block
       self._dag.add_node(block.name)
-    assert len(self._name_to_blocks) > 0, 'there must be more than one block in backbone'
+    num_blocks = len(self._name_to_blocks)
+    assert num_blocks > 0, 'there must be at least one block in backbone'
     for block in config.blocks:
-      assert len(block.inputs) > 0, 'there is no input for block: %s' % block.name
+      assert len(block.inputs) > 0, 'no input for block: %s' % block.name
       for node in block.inputs:
         if node in self._name_to_blocks:
           self._dag.add_edge(node, block.name)
 
-  def block_input(self, config, block_outputs):
+  def block_input(self, config, block_outputs, output_list=False):
     inputs = []
     for input_name in config.inputs:
       if input_name in block_outputs:
@@ -92,26 +48,37 @@ def block_input(self, config, block_outputs):
       else:
         input_feature, _ = self._input_layer(self._features, input_name)
       inputs.append(input_feature)
-    return concat_inputs(inputs, config.name)
+
+    if output_list:
+      output = inputs
+    else:
+      output = concat_inputs(inputs, config.input_concat_axis, config.name)
+
+    if config.HasField('extra_input_fn'):
+      fn = eval(config.extra_input_fn)
+      output = fn(output)
+    return output
 
   def __call__(self, is_training, *args, **kwargs):
     block_outputs = {}
     blocks = self._dag.topological_sort()
-    logging.info("backbone topological order: " + ','.join(blocks))
-    print("backbone topological order: " + ','.join(blocks))
+    logging.info('backbone topological order: ' + ','.join(blocks))
+    print('backbone topological order: ' + ','.join(blocks))
     for block in blocks:
       config = self._name_to_blocks[block]
       layer = config.WhichOneof('layer')
       if layer == 'input_layer':
-        assert len(config.inputs) == 1, 'only one input needed for input_layer: ' + block.name
+        if len(config.inputs) != 1:
+          raise ValueError('only one input allowed for input_layer: ' +
+                           block.name)
         conf = config.input_layer
-        input_layer = EnhancedInputLayer(conf, self._input_layer, self._features)
+        input_layer = EnhancedInputLayer(conf, self._input_layer,
+                                         self._features)
         output = input_layer(config.inputs[0], is_training)
         block_outputs[block] = output
       elif layer == 'periodic_embedding':
         input_feature = self.block_input(config, block_outputs)
-        conf = config.periodic_embedding
-        num_emb = PeriodicEmbedding(conf.embedding_dim, stddev=conf.coef_stddev, scope=block)
+        num_emb = PeriodicEmbedding(config.periodic_embedding, scope=block)
         block_outputs[block] = num_emb(input_feature)
       elif layer == 'auto_dis_embedding':
         input_feature = self.block_input(config, block_outputs)
@@ -121,30 +88,28 @@ def __call__(self, is_training, *args, **kwargs):
         input_feature = self.block_input(config, block_outputs)
         conf = config.highway
         highway_layer = highway(
-          input_feature,
-          conf.emb_size,
-          activation=conf.activation,
-          dropout=conf.dropout_rate,
-          scope=block)
+            input_feature,
+            conf.emb_size,
+            activation=conf.activation,
+            dropout=conf.dropout_rate,
+            scope=block)
         block_outputs[block] = highway_layer(input_feature)
       elif layer == 'mlp':
         input_feature = self.block_input(config, block_outputs)
         mlp = dnn.DNN(
-          config.mlp,
-          self._l2_reg,
-          name='%s_mlp' % block,
-          is_training=is_training)
+            config.mlp,
+            self._l2_reg,
+            name='%s_mlp' % block,
+            is_training=is_training,
+            last_layer_no_activation=config.mlp.last_layer_no_activation,
+            last_layer_no_batch_norm=config.mlp.last_layer_no_batch_norm)
         block_outputs[block] = mlp(input_feature)
       elif layer == 'sequence_encoder':
         block_outputs[block] = self.sequence_encoder(config, is_training)
       elif layer == 'masknet':
         input_feature = self.block_input(config, block_outputs)
-        mask_net = MaskNet(
-          config.masknet,
-          name=block,
-          reuse=tf.AUTO_REUSE)
-        output = mask_net(
-          input_feature, is_training, l2_reg=self._l2_reg)
+        mask_net = MaskNet(config.masknet, name=block, reuse=tf.AUTO_REUSE)
+        output = mask_net(input_feature, is_training, l2_reg=self._l2_reg)
         block_outputs[block] = output
       elif layer == 'senet':
         input_feature = self.block_input(config, block_outputs)
@@ -158,8 +123,28 @@ def __call__(self, is_training, *args, **kwargs):
         block_outputs[block] = output
       elif layer == 'fm':
         input_feature = self.block_input(config, block_outputs)
-        fm = FMLayer()
+        fm = FMLayer(config.fm, name=block)
         block_outputs[block] = fm(input_feature)
+      elif layer == 'concat':
+        input_feature = self.block_input(config, block_outputs)
+        concat = Concatenate(config.concat)
+        block_outputs[block] = concat(input_feature)
+      elif layer == 'reshape':
+        input_feature = self.block_input(config, block_outputs)
+        block_outputs[block] = tf.reshape(input_feature, list(config.reshape.dims))
+      elif layer == 'add':
+        input_feature = self.block_input(config, block_outputs, output_list=True)
+        block_outputs[block] = add_op(input_feature)
+      elif layer == 'dot':
+        input_feature = self.block_input(config, block_outputs)
+        block_outputs[block] = dot_op(input_feature)
+      elif layer == 'Lambda':
+        input_feature = self.block_input(config, block_outputs)
+        fn = eval(config.Lambda.expression)
+        block_outputs[block] = fn(input_feature)
+      elif layer == 'chain':
+        input_feature = self.block_input(config, block_outputs)
+        block_outputs[block] = op_chain(input_feature, config.chain.ops)
       else:
         raise NotImplementedError('Unsupported backbone layer:' + layer)
 
@@ -170,13 +155,17 @@ def __call__(self, is_training, *args, **kwargs):
       else:
         raise ValueError('No output `%s` of backbone to be concat' % output)
 
-    output = concat_inputs(temp)
+    output = concat_inputs(temp, msg='backbone')
     if self._config.HasField('top_mlp'):
+      no_act = self._config.top_mlp.last_layer_no_activation
+      no_bn = self._config.top_mlp.last_layer_no_batch_norm
       final_dnn = dnn.DNN(
-        self._config.top_mlp,
-        self._l2_reg,
-        name='backbone_top_mlp',
-        is_training=is_training)
+          self._config.top_mlp,
+          self._l2_reg,
+          name='backbone_top_mlp',
+          is_training=is_training,
+          last_layer_no_activation=no_act,
+          last_layer_no_batch_norm=no_bn)
       output = final_dnn(output)
     return output
 
@@ -189,22 +178,90 @@ def sequence_encoder(self, config, is_training):
     conf = config.sequence_encoder
     if conf.HasField('mlp'):
       sequence_dnn = dnn.DNN(
-        conf.mlp,
-        self._l2_reg,
-        name='%s_seq_dnn' % config.name,
-        is_training=is_training)
+          conf.mlp,
+          self._l2_reg,
+          name='%s_seq_dnn' % config.name,
+          is_training=is_training)
       encoding = sequence_dnn(encoding)
     return encoding
 
 
-def concat_inputs(inputs, msg=''):
+def concat_inputs(inputs, axis=-1, msg=''):
   if len(inputs) > 1:
-    if type(inputs[0]) == list:
+    if all(map(lambda x: type(x) == list, inputs)):
+      # merge multiple lists into a list
       from functools import reduce
       return reduce(lambda x, y: x + y, inputs)
-    return tf.concat(inputs, axis=-1)
+
+    if axis != -1:
+      logging.info('concat inputs %s axis=%d' % (msg, axis))
+    return tf.concat(inputs, axis=axis)
+
   if len(inputs) == 1:
     return inputs[0]
   raise ValueError('no inputs to be concat:' + msg)
 
 
+def op_chain(inputs, ops):
+  output = inputs
+  for op in ops:
+    op_name = op.WhichOneOf('Op')
+    output = run_op(output, op_name, op, block='op_chain')
+  return output
+
+
+def run_op(inputs, op_name, config, block='', is_training=False, l2_reg=None):
+  if op_name == 'periodic_embedding':
+    num_emb = PeriodicEmbedding(config.periodic_embedding, scope=block)
+    return num_emb(inputs)
+  elif op_name == 'auto_dis_embedding':
+    num_emb = AutoDisEmbedding(config.auto_dis_embedding, scope=block)
+    return num_emb(inputs)
+  elif op_name == 'highway':
+    conf = config.highway
+    highway_op_name = highway(
+      inputs,
+      conf.emb_size,
+      activation=conf.activation,
+      dropout=conf.dropout_rate,
+      scope=block)
+    return highway_op_name(inputs)
+  elif op_name == 'mlp':
+    mlp = dnn.DNN(
+      config.mlp,
+      l2_reg,
+      name='%s_mlp' % block,
+      is_training=is_training,
+      last_layer_no_activation=config.mlp.last_layer_no_activation,
+      last_layer_no_batch_norm=config.mlp.last_layer_no_batch_norm)
+    return mlp(inputs)
+  elif op_name == 'masknet':
+    mask_net = MaskNet(config.masknet, name=block, reuse=tf.AUTO_REUSE)
+    output = mask_net(inputs, is_training, l2_reg=l2_reg)
+    return output
+  elif op_name == 'senet':
+    senet = SENet(config.senet, name=block)
+    output = senet(inputs)
+    return output
+  elif op_name == 'fibinet':
+    fibinet = FiBiNetLayer(config.fibinet, name=block)
+    output = fibinet(inputs, is_training, l2_reg=l2_reg)
+    return output
+  elif op_name == 'fm':
+    fm = FMLayer(config.fm, name=block)
+    return fm(inputs)
+  if op_name == 'Lambda':
+    fn = eval(config.Lambda.expression)
+    output = fn(inputs)
+  elif op_name == 'concat':
+    concat = Concatenate(config.concat)
+    output = concat(inputs)
+  elif op_name == 'reshape':
+    output = tf.reshape(inputs, list(config.reshape.dims))
+  elif op_name == 'add':
+    output = add_op(inputs)
+  elif op_name == 'dot':
+    output = dot_op(inputs)
+  else:
+    raise NotImplementedError('Unsupported op:' + op_name)
+  return output
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index be4615699..a453141f9 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -2,6 +2,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import itertools
 import logging
+
 import six
 import tensorflow as tf
 
@@ -80,6 +81,88 @@ def layer_norm(input_tensor, name=None, reuse=None):
       scope=name)
 
 
+class EnhancedInputLayer(object):
+
+  def __init__(self, config, input_layer, feature_dict):
+    if config.do_batch_norm and config.do_layer_norm:
+      raise ValueError(
+          'can not do batch norm and layer norm for input layer at the same time'
+      )
+    self._config = config
+    self._input_layer = input_layer
+    self._feature_dict = feature_dict
+
+  def __call__(self, feature_group, is_training, *args, **kwargs):
+    features, feature_list = self._input_layer(self._feature_dict,
+                                               feature_group)
+    num_features = len(feature_list)
+
+    do_feature_dropout = 0.0 < self._config.feature_dropout_rate < 1.0
+    if self._config.output_feature_list or do_feature_dropout:
+      if self._config.do_layer_norm or self._config.do_batch_norm:
+        for i in range(num_features):
+          fea = feature_list[i]
+          if self._config.do_batch_norm:
+            fea = tf.layers.batch_normalization(fea, training=is_training)
+          elif self._config.do_layer_norm:
+            fea = layer_norm(fea)
+          feature_list[i] = fea
+    elif self._config.do_batch_norm:
+      features = tf.layers.batch_normalization(features, training=is_training)
+    elif self._config.do_layer_norm:
+      features = layer_norm(features)
+
+    if do_feature_dropout and is_training:
+      keep_prob = 1.0 - self._config.feature_dropout_rate
+      bern = tf.distributions.Bernoulli(probs=keep_prob)
+      mask = bern.sample(num_features)
+      for i in range(num_features):
+        fea = tf.div(feature_list[i], keep_prob) * mask[i]
+        feature_list[i] = fea
+      features = tf.concat(feature_list, axis=-1)
+
+    do_dropout = 0.0 < self._config.dropout_rate < 1.0
+    if self._config.output_feature_list:
+      if do_dropout:
+        for i in range(num_features):
+          fea = feature_list[i]
+          fea = tf.layers.dropout(
+              fea, self._config.dropout_rate, training=is_training)
+          feature_list[i] = fea
+      if self._config.output_3d_tensor:
+        for i in range(num_features):
+          feature_list[i] = tf.expand_dims(feature_list[i], axis=1)
+        return tf.concat(feature_list, axis=1)
+      return feature_list
+
+    if do_dropout:
+      features = tf.layers.dropout(
+          features, self._config.dropout_rate, training=is_training)
+
+    if self._config.output_3d_tensor:
+      dim = int(feature_list[0].shape[-1])
+      return tf.reshape(features, [-1, num_features, dim])
+    return features
+
+
+class Concatenate(object):
+
+  def __init__(self, config):
+    self.config = config
+
+  def __call__(self, inputs, *args, **kwargs):
+    if self.config.HasField('expand_dim_before'):
+      dim = self.config.expand_dim_before
+      output = tf.stack(inputs, axis=dim)
+    else:
+      output = tf.concat(inputs, axis=self.config.axis)
+
+    if self.config.HasField('expand_dim_after'):
+      dim = self.config.expand_dim_after
+      output = tf.expand_dims(output, dim)
+    return output
+
+
 class SENet(object):
   """SENet+ Layer used in FiBiNET，支持不同field的embedding dimension不等.
 
diff --git a/easy_rec/python/layers/fibinet.py b/easy_rec/python/layers/fibinet.py
index 4ba15789e..77b6da4a5 100644
--- a/easy_rec/python/layers/fibinet.py
+++ b/easy_rec/python/layers/fibinet.py
@@ -46,9 +46,9 @@ def __call__(self, inputs, is_training, l2_reg=None, *args, **kwargs):
 
     if self._config.HasField('mlp'):
       final_dnn = dnn.DNN(
-        self._config.mlp,
-        l2_reg,
-        name='%s_fibinet_mlp' % self.name,
-        is_training=is_training)
+          self._config.mlp,
+          l2_reg,
+          name='%s_fibinet_mlp' % self.name,
+          is_training=is_training)
       feature = final_dnn(feature)
     return feature
diff --git a/easy_rec/python/layers/fm.py b/easy_rec/python/layers/fm.py
index 198d6b8d6..87d621d57 100644
--- a/easy_rec/python/layers/fm.py
+++ b/easy_rec/python/layers/fm.py
@@ -19,8 +19,7 @@ def __init__(self, name='fm'):
 
   def __call__(self, fm_fea):
     with tf.name_scope(self._name):
-      fm_feas = [tf.expand_dims(x, axis=1) for x in fm_fea]
-      fm_feas = tf.concat(fm_feas, axis=1)
+      fm_feas = tf.stack(fm_fea, axis=1)
       sum_square = tf.square(tf.reduce_sum(fm_feas, 1))
       square_sum = tf.reduce_sum(tf.square(fm_feas), 1)
       y_v = 0.5 * tf.subtract(sum_square, square_sum)
@@ -28,32 +27,42 @@ def __call__(self, fm_fea):
 
 
 class FMLayer(object):
-  """Factorization Machine models pairwise (order-2) feature interactions
-   without linear term and bias.
-    Input shape
+  """Factorization Machine models pairwise (order-2) feature interactions without linear term and bias.
+
+  References
+    - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
+  """
+  def __init__(self, config, name='fm'):
+    self.name = name
+    self.config = config
+
+  def __call__(self, inputs):
+    """FM layer.
+
+    Input shape.
       - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
       - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
     Output shape
       - 2D tensor with shape: ``(batch_size, 1)``.
-    References
-      - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
-  """
-  def __call__(self, inputs):
+    """
     if type(inputs) == list:
-      emb_dims = set()
-      for x in inputs:
-        emb_dims.add(int(x.shape[-1]))
-      assert len(emb_dims) == 1, 'all embedding dim must be the same in FM layer:' + ','.join([str(d) for d in emb_dims])
-      num_fea = len(inputs)
-      emb_dim = emb_dims.pop()
-      fea = tf.concat(inputs, axis=-1)
-      fea = tf.reshape(fea, [-1, num_fea, emb_dim])
+      emb_dims = set(map(lambda x: int(x.shape[-1]), inputs))
+      if len(emb_dims) != 1:
+        dims = ','.join([str(d) for d in emb_dims])
+        raise ValueError('all embedding dim must be equal in FM layer:' + dims)
+
+      with tf.name_scope(self.name):
+        fea = tf.stack(inputs, axis=1)
     else:
       assert inputs.shape.ndims == 3, 'input of FM layer must be a 3D tensor or a list of 2D tensors'
       fea = inputs
 
-    square_of_sum = tf.square(tf.reduce_sum(fea, axis=1, keepdims=True))
-    sum_of_square = tf.reduce_sum(fea * fea, axis=1, keepdims=True)
-    cross_term = square_of_sum - sum_of_square
-    cross_term = 0.5 * tf.reduce_sum(cross_term, axis=2, keepdims=False)
+    with tf.name_scope(self.name):
+      square_of_sum = tf.square(tf.reduce_sum(fea, axis=1))
+      sum_of_square = tf.reduce_sum(fea * fea, axis=1)
+      cross_term = square_of_sum - sum_of_square
+      if self.config.use_variant:
+        cross_term = 0.5 * cross_term
+      else:
+        cross_term = 0.5 * tf.reduce_sum(cross_term, axis=-1)
     return cross_term
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index ced65c0cf..33cd681ad 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -4,6 +4,7 @@
 from collections import OrderedDict
 
 import tensorflow as tf
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope
 
@@ -16,12 +17,9 @@
 from easy_rec.python.layers.common_layers import text_cnn
 from easy_rec.python.layers.fscd_layer import FSCDLayer
 from easy_rec.python.protos.feature_config_pb2 import WideOrDeep
-from easy_rec.python.utils import shape_utils
+from easy_rec.python.utils import shape_utils, conditional
 
-from easy_rec.python.compat.feature_column.feature_column_v2 import EmbeddingColumn  # NOQA
-from easy_rec.python.compat.feature_column.feature_column_v2 import SharedEmbeddingColumn  # NOQA
-
-from easy_rec.python.compat.feature_column.feature_column import _SharedEmbeddingColumn  # NOQA
+from easy_rec.python.compat.feature_column.feature_column_v2 import is_embedding_column
 
 
 class InputLayer(object):
@@ -39,7 +37,7 @@ def __init__(self,
                embedding_regularizer=None,
                kernel_regularizer=None,
                is_training=False,
-               do_feature_normalize=False):
+               is_predicting=False):
     self._feature_configs = feature_configs
     self._feature_groups = {
         x.group_name: FeatureGroup(x) for x in feature_groups_config
@@ -66,8 +64,8 @@ def __init__(self,
     self._embedding_regularizer = embedding_regularizer
     self._kernel_regularizer = kernel_regularizer
     self._is_training = is_training
+    self._is_predicting = is_predicting
     self._variational_dropout_config = variational_dropout_config
-    self._do_feature_normalize = do_feature_normalize
 
   def has_group(self, group_name):
     return group_name in self._feature_groups
@@ -97,7 +95,8 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
     feature_name_to_output_tensors = {}
     negative_sampler = self._feature_groups[group_name]._config.negative_sampler
     if is_combine:
-      concat_features, group_features = self.single_call_input_layer(
+      with conditional(self._is_predicting, ops.device('/CPU:0')):
+        concat_features, group_features = self.single_call_input_layer(
           features, group_name, feature_name_to_output_tensors)
       if group_name in self._group_name_to_seq_features:
         # for target attention
@@ -121,7 +120,7 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
         return concat_features, group_features
     else:  # return sequence feature in raw format instead of combine them
       if self._variational_dropout_config is not None:
-        logging.warn(
+        logging.warning(
             'variational dropout is not supported in not combined mode now.')
 
       feature_group = self._feature_groups[group_name]
@@ -138,13 +137,11 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
             group_columns,
             cols_to_output_tensors=cols_to_output_tensors,
             feature_name_to_output_tensors=feature_name_to_output_tensors,
-            do_normalize=self._do_feature_normalize)
+            sort_feature_columns_by_name=False)
         group_features = [cols_to_output_tensors[x] for x in group_columns]
 
         for col, val in cols_to_output_tensors.items():
-          if isinstance(col, EmbeddingColumn) or isinstance(
-              col, _SharedEmbeddingColumn) or isinstance(
-                  col, SharedEmbeddingColumn):
+          if is_embedding_column(col):
             embedding_reg_lst.append(val)
 
       builder = feature_column._LazyBuilder(features)
@@ -188,8 +185,7 @@ def single_call_input_layer(self,
         features,
         group_columns,
         cols_to_output_tensors=cols_to_output_tensors,
-        feature_name_to_output_tensors=feature_name_to_output_tensors,
-        do_normalize=self._do_feature_normalize)
+        feature_name_to_output_tensors=feature_name_to_output_tensors)
 
     embedding_reg_lst = []
     builder = feature_column._LazyBuilder(features)
@@ -197,7 +193,8 @@ def single_call_input_layer(self,
     for column in sorted(group_seq_columns, key=lambda x: x.name):
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name):
-        seq_feature, seq_len = column._get_sequence_dense_tensor(builder)
+        with conditional(self._is_predicting, ops.device('/CPU:0')):
+          seq_feature, seq_len = column._get_sequence_dense_tensor(builder)
         embedding_reg_lst.append(seq_feature)
 
         sequence_combiner = column.sequence_combiner
@@ -265,8 +262,7 @@ def single_call_input_layer(self,
                        [cols_to_output_tensors[x] for x in group_seq_columns]
 
     for fc, val in cols_to_output_tensors.items():
-      if isinstance(fc, EmbeddingColumn) or isinstance(
-          fc, _SharedEmbeddingColumn) or isinstance(fc, SharedEmbeddingColumn):
+      if is_embedding_column(fc):
         embedding_reg_lst.append(val)
 
     if embedding_reg_lst:
diff --git a/easy_rec/python/layers/mask_net.py b/easy_rec/python/layers/mask_net.py
index 034cd6018..2ec3f5799 100644
--- a/easy_rec/python/layers/mask_net.py
+++ b/easy_rec/python/layers/mask_net.py
@@ -46,8 +46,13 @@ def __call__(self, net, mask_input):
 
     output_size = self.mask_block_config.output_size
     hidden = tf.layers.dense(
-        masked_net, output_size, use_bias=False, name='%s/output' % self.name, reuse=self.reuse)
-    ln_hidden = layer_norm(hidden, name='%s/ln_output' % self.name, reuse=self.reuse)
+        masked_net,
+        output_size,
+        use_bias=False,
+        name='%s/output' % self.name,
+        reuse=self.reuse)
+    ln_hidden = layer_norm(
+        hidden, name='%s/ln_output' % self.name, reuse=self.reuse)
     return tf.nn.relu(ln_hidden)
 
 
diff --git a/easy_rec/python/layers/numerical_embedding.py b/easy_rec/python/layers/numerical_embedding.py
index 26e9f63a3..1c45fa361 100644
--- a/easy_rec/python/layers/numerical_embedding.py
+++ b/easy_rec/python/layers/numerical_embedding.py
@@ -3,21 +3,110 @@
 import math
 
 import tensorflow as tf
-from easy_rec.python.compat.array_ops import repeat
+
+from easy_rec.python.utils.activation import get_activation
+
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
 
 
-class PeriodicEmbedding(object):
+class NLinear(object):
+  """N linear layers for N token (feature) embeddings.
+
+  To understand this module, let's revise `tf.layers.dense`. When `tf.layers.dense` is
+  applied to three-dimensional inputs of the shape
+  ``(batch_size, n_tokens, d_embedding)``, then the same linear transformation is
+  applied to each of ``n_tokens`` token (feature) embeddings.
+
+  By contrast, `NLinear` allocates one linear layer per token (``n_tokens`` layers in total).
+  One such layer can be represented as ``tf.layers.dense(d_in, d_out)``.
+  So, the i-th linear transformation is applied to the i-th token embedding, as
+  illustrated in the following pseudocode::
+
+      layers = [tf.layers.dense(d_in, d_out) for _ in range(n_tokens)]
+      x = tf.random.normal(batch_size, n_tokens, d_in)
+      result = tf.stack([layers[i](x[:, i]) for i in range(n_tokens)], 1)
+
+  Examples:
+      .. testcode::
+
+          batch_size = 2
+          n_features = 3
+          d_embedding_in = 4
+          d_embedding_out = 5
+          x = tf.random.normal(batch_size, n_features, d_embedding_in)
+          m = NLinear(n_features, d_embedding_in, d_embedding_out)
+          assert m(x).shape == (batch_size, n_features, d_embedding_out)
+  """
 
-  def __init__(self, embedding_dim, scope='periodic_embedding', stddev=1.0):
-    """On Embeddings for Numerical Features in Tabular Deep Learning.
+  def __init__(self, n_tokens, d_in, d_out, bias=True, scope='nd_linear'):
+    """Init with input shapes.
 
-    Refer: https://arxiv.org/pdf/2203.05556.pdf
+    Args:
+        n_tokens: the number of tokens (features)
+        d_in: the input dimension
+        d_out: the output dimension
+        bias: indicates if the underlying linear layers have biases
     """
-    self.embedding_dim = embedding_dim // 2
+    with tf.variable_scope(scope):
+      self.weight = tf.get_variable(
+          'weights', [1, n_tokens, d_in, d_out], dtype=tf.float32)
+      if bias:
+        initializer = tf.constant_initializer(0.0)
+        self.bias = tf.get_variable(
+            'bias', [1, n_tokens, d_out],
+            dtype=tf.float32,
+            initializer=initializer)
+      else:
+        self.bias = None
+
+  def __call__(self, x, *args, **kwargs):
+    if x.shape.ndims != 3:
+      raise ValueError(
+          'The input must have three dimensions (batch_size, n_tokens, d_embedding)'
+      )
+    if x.shape[2] != self.weight.shape[2]:
+      raise ValueError('invalid input embedding dimension %d, expect %d' %
+                       (int(x.shape[2]), int(self.weight.shape[2])))
+
+    x = x[..., None] * self.weight  # [B, N, D, D_out]
+    x = tf.reduce_sum(x, axis=-2)  # [B, N, D_out]
+    if self.bias is not None:
+      x = x + self.bias
+    return x
+
+
+class PeriodicEmbedding(object):
+  """Periodic embeddings for numerical features described in [1].
+
+  References:
+    * [1] Yury Gorishniy, Ivan Rubachev, Artem Babenko,
+    "On Embeddings for Numerical Features in Tabular Deep Learning", 2022
+    https://arxiv.org/pdf/2203.05556.pdf
+  """
+
+  def __init__(self, config, scope='periodic_embedding'):
+    """Init with a pb config.
+
+    Args:
+      config: pb config
+      config.embedding_dim: the embedding size, must be an even positive integer.
+      config.sigma: the scale of the weight initialization.
+        **This is a super important parameter which significantly affects performance**.
+        Its optimal value can be dramatically different for different datasets, so
+        no "default value" can exist for this parameter, and it must be tuned for
+        each dataset. In the original paper, during hyperparameter tuning, this
+        parameter was sampled from the distribution ``LogUniform[1e-2, 1e2]``.
+        A similar grid would be ``[1e-2, 1e-1, 1e0, 1e1, 1e2]``.
+        If possible, add more intermidiate values to this grid.
+      config.output_3d_tensor: whether to output a 3d tensor
+    """
+    self.config = config
+    if config.embedding_dim % 2:
+      raise ValueError('embedding_dim must be even')
+    self.emb_dim = config.embedding_dim // 2
     self.scope = scope
-    self.initializer = tf.random_normal_initializer(stddev=stddev)
+    self.initializer = tf.random_normal_initializer(stddev=config.sigma)
 
   def __call__(self, inputs, *args, **kwargs):
     if inputs.shape.ndims != 2:
@@ -26,24 +115,29 @@ def __call__(self, inputs, *args, **kwargs):
     num_features = int(inputs.shape[-1])
     with tf.variable_scope(self.scope):
       c = tf.get_variable(
-          'coef',
-          shape=[1, num_features * self.embedding_dim],
+          'coefficients',
+          shape=[1, num_features, self.emb_dim],
           initializer=self.initializer)
 
-      features = repeat(inputs, self.embedding_dim, axis=1)
-      v = features * c * 2 * math.pi
-      sin_v = tf.split(tf.sin(v), num_features, axis=1)
-      cos_v = tf.split(tf.cos(v), num_features, axis=1)
+      features = inputs[..., None]  # [B, N, 1]
+      v = 2 * math.pi * c * features  # [B, N, E]
+      emb = tf.concat([tf.sin(v), tf.cos(v)], axis=-1)  # [B, N, 2E]
+
+      dim = self.config.embedding_dim
+      if self.config.add_linear_layer:
+        linear = NLinear(num_features, dim, dim)
+        emb = linear(emb)
+        act = get_activation(self.config.linear_activation)
+        if callable(act):
+          emb = act(emb)
 
-      embeddings = []
-      for val in zip(sin_v, cos_v):
-        embedding = tf.concat(val, axis=1)
-        embedding = tf.layers.dense(embedding, int(embedding.shape[-1]), activation=tf.nn.relu)
-        embeddings.append(embedding)
-      return tf.concat(embeddings, axis=1)
+      if self.config.output_3d_tensor:
+        return emb
+      return tf.reshape(emb, [-1, num_features * dim])
 
 
 class AutoDisEmbedding(object):
+
   def __init__(self, config, scope='auto_dis'):
     """An Embedding Learning Framework for Numerical Features in CTR Prediction.
 
@@ -60,21 +154,29 @@ def __call__(self, inputs, *args, **kwargs):
 
     num_features = int(inputs.shape[-1])
     with tf.variable_scope(self.scope):
-      meta_emb = tf.get_variable('meta_embedding', shape=[1, num_features, self.num_bins, self.emb_dim])
+      meta_emb = tf.get_variable(
+          'meta_embedding',
+          shape=[1, num_features, self.num_bins, self.emb_dim])
       w = tf.get_variable('project_w', shape=[1, num_features, self.num_bins])
-      mat = tf.get_variable('project_mat', shape=[1, num_features, self.num_bins, self.num_bins])
+      mat = tf.get_variable(
+          'project_mat', shape=[1, num_features, self.num_bins, self.num_bins])
 
       x = tf.expand_dims(inputs, axis=-1)  # [B, num_fea, 1]
       hidden = tf.nn.leaky_relu(w * x)  # [B, num_fea, num_bin]
 
-      y = tf.matmul(mat, tf.expand_dims(hidden, axis=-1))  # [B, num_fea, num_bin, 1]
+      y = tf.matmul(mat, hidden[..., None])  # [B, num_fea, num_bin, 1]
       y = tf.squeeze(y, axis=3)  # [B, num_fea, num_bin]
 
       # keep_prob(float): if dropout_flag is True, keep_prob rate to keep connect; (float, keep_prob=0.8)
       alpha = self.config.keep_prob
       x_bar = y + alpha * hidden  # [B, num_fea, num_bin]
-      x_hat = tf.nn.softmax(x_bar / self.config.temperature)  # [B, num_fea, num_bin]
+      t = self.config.temperature
+      x_hat = tf.nn.softmax(x_bar / t)  # [B, num_fea, num_bin]
 
-      emb = tf.matmul(tf.expand_dims(x_hat, axis=2), meta_emb)  # [B, num_fea, 1, emb_dim]
+      emb = tf.matmul(x_hat[:, :, None, :], meta_emb)  # [B, num_fea, 1, emb_dim]
       # emb = tf.squeeze(emb, axis=2)  # [B, num_fea, emb_dim]
-      return tf.reshape(emb, [-1, self.emb_dim * num_features])  # [B, num_fea*emb_dim]
+      if self.config.output_3d_tensor:
+        return tf.reshape(
+            emb, [-1, num_features, self.emb_dim])  # [B, num_fea, emb_dim]
+      return tf.reshape(
+          emb, [-1, self.emb_dim * num_features])  # [B, num_fea*emb_dim]
diff --git a/easy_rec/python/model/easy_rec_estimator.py b/easy_rec/python/model/easy_rec_estimator.py
index 51ecad09f..9cbd28b6c 100644
--- a/easy_rec/python/model/easy_rec_estimator.py
+++ b/easy_rec/python/model/easy_rec_estimator.py
@@ -514,7 +514,8 @@ def _export_model_fn(self, features, labels, run_config, params):
         self.feature_configs,
         features,
         labels=None,
-        is_training=False)
+        is_training=False,
+        is_predicting=True)
     model.build_predict_graph()
 
     export_config = self._pipeline_config.export_config
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index c6d864498..331d0282e 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -11,8 +11,8 @@
 from tensorflow.python.ops.variables import PartitionedVariable
 
 from easy_rec.python.compat import regularizers
-from easy_rec.python.layers.backbone import Backbone
 from easy_rec.python.layers import input_layer
+from easy_rec.python.layers.backbone import Backbone
 from easy_rec.python.layers.sequence_encoder import SequenceEncoder
 from easy_rec.python.utils import constant
 from easy_rec.python.utils import estimator_utils
@@ -34,10 +34,12 @@ def __init__(self,
                feature_configs,
                features,
                labels=None,
-               is_training=False):
+               is_training=False,
+               is_predicting=False):
     self._base_model_config = model_config
     self._model_config = model_config
     self._is_training = is_training
+    self._is_predicting = is_predicting
     self._feature_dict = features
 
     # embedding variable parameters
@@ -67,9 +69,12 @@ def __init__(self,
                                              self._l2_reg)
     self._sequence_encoding_by_group_name = {}
     if model_config.HasField('backbone'):
-      self._backbone = Backbone(model_config.backbone, self, features,
-                                input_layer=self._input_layer,
-                                l2_reg=self._l2_reg)
+      self._backbone = Backbone(
+          model_config.backbone,
+          self,
+          features,
+          input_layer=self._input_layer,
+          l2_reg=self._l2_reg)
     else:
       self._backbone = None
 
@@ -120,7 +125,8 @@ def build_input_layer(self, model_config, feature_configs):
         kernel_regularizer=self._l2_reg,
         variational_dropout_config=model_config.variational_dropout
         if model_config.HasField('variational_dropout') else None,
-        is_training=self._is_training)
+        is_training=self._is_training,
+        is_predicting=self._is_predicting)
 
   def get_sequence_encoding(self, group_name=None, is_training=True):
     if group_name is not None:
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index 2b4ccfd21..0285f225c 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -31,10 +31,22 @@ def __init__(self,
 
   def build_predict_graph(self):
     if not self.has_backbone:
-      raise NotImplementedError('method `build_predict_graph` must be implemented when backbone network do not exits')
+      raise NotImplementedError(
+        'method `build_predict_graph` must be implemented when backbone network do not exits'
+      )
+    output = self.backbone
+
+    model_config = getattr(self._base_model_config,
+                           self._base_model_config.WhichOneof('model'))
+    if hasattr(model_config, 'add_head_logits_layer') and \
+        model_config.HasField('add_head_logits_layer'):
+      add_head_logits_layer = model_config.add_head_logits_layer
+    else:
+      add_head_logits_layer = True
+    if add_head_logits_layer:
+      logging.info('add head logits layer for rank model')
+      output = tf.layers.dense(output, self._num_class, name='output')
 
-    net = self.backbone
-    output = tf.layers.dense(net, self._num_class, name='output')
     self._add_to_prediction_dict(output)
     return self._prediction_dict
 
@@ -45,9 +57,9 @@ def _output_to_prediction_impl(self,
                                  suffix=''):
     prediction_dict = {}
     binary_loss_type = {
-        LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
-        LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
-        LossType.PAIRWISE_LOGISTIC_LOSS
+      LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
+      LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
+      LossType.PAIRWISE_LOGISTIC_LOSS
     }
     if loss_type in binary_loss_type:
       assert num_class == 1, 'num_class must be 1 when loss type is %s' % loss_type.name
@@ -74,9 +86,9 @@ def _output_to_prediction_impl(self,
         prediction_dict['logits' + suffix] = output
         prediction_dict['probs' + suffix] = probs
         prediction_dict['logits' + suffix + '_y'] = math_ops.reduce_max(
-            output, axis=1)
+          output, axis=1)
         prediction_dict['probs' + suffix + '_y'] = math_ops.reduce_max(
-            probs, axis=1)
+          probs, axis=1)
         prediction_dict['y' + suffix] = tf.argmax(output, axis=1)
     elif loss_type == LossType.L2_LOSS:
       output = tf.squeeze(output, axis=1)
@@ -89,12 +101,12 @@ def _output_to_prediction_impl(self,
   def _add_to_prediction_dict(self, output):
     if len(self._losses) == 0:
       prediction_dict = self._output_to_prediction_impl(
-          output, loss_type=self._loss_type, num_class=self._num_class)
+        output, loss_type=self._loss_type, num_class=self._num_class)
       self._prediction_dict.update(prediction_dict)
     else:
       for loss in self._losses:
         prediction_dict = self._output_to_prediction_impl(
-            output, loss_type=loss.loss_type, num_class=self._num_class)
+          output, loss_type=loss.loss_type, num_class=self._num_class)
         self._prediction_dict.update(prediction_dict)
 
   def build_rtp_output_dict(self):
@@ -106,9 +118,9 @@ def build_rtp_output_dict(self):
       op = tf.get_default_graph().get_operation_by_name('rank_predict')
       if len(op.outputs) != 1:
         raise ValueError(
-            ('failed to build RTP rank_predict output: op {}[{}] has output ' +
-             'size {}, however 1 is expected.').format(op.name, op.type,
-                                                       len(op.outputs)))
+          ('failed to build RTP rank_predict output: op {}[{}] has output ' +
+           'size {}, however 1 is expected.').format(op.name, op.type,
+                                                     len(op.outputs)))
       rank_predict = op.outputs[0]
     except KeyError:
       forwarded = None
@@ -116,32 +128,32 @@ def build_rtp_output_dict(self):
       if len(self._losses) > 0:
         loss_types = {loss.loss_type for loss in self._losses}
       binary_loss_set = {
-          LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
-          LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
-          LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
-          LossType.JRC_LOSS
+        LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
+        LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
+        LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
+        LossType.JRC_LOSS
       }
       if loss_types & binary_loss_set:
         if 'probs' in self._prediction_dict:
           forwarded = self._prediction_dict['probs']
         else:
           raise ValueError(
-              'failed to build RTP rank_predict output: classification model ' +
-              "expect 'probs' prediction, which is not found. Please check if" +
-              ' build_predict_graph() is called.')
+            'failed to build RTP rank_predict output: classification model ' +
+            "expect 'probs' prediction, which is not found. Please check if" +
+            ' build_predict_graph() is called.')
       elif loss_types & {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
         if 'y' in self._prediction_dict:
           forwarded = self._prediction_dict['y']
         else:
           raise ValueError(
-              'failed to build RTP rank_predict output: regression model expect'
-              +
-              "'y' prediction, which is not found. Please check if build_predic"
-              + 't_graph() is called.')
+            'failed to build RTP rank_predict output: regression model expect'
+            +
+            "'y' prediction, which is not found. Please check if build_predic"
+            + 't_graph() is called.')
       else:
         logging.warning(
-            'failed to build RTP rank_predict: unsupported loss type {}'.format(
-                loss_types))
+          'failed to build RTP rank_predict: unsupported loss type {}'.format(
+            loss_types))
       if forwarded is not None:
         rank_predict = tf.identity(forwarded, name='rank_predict')
     if rank_predict is not None:
@@ -158,9 +170,9 @@ def _build_loss_impl(self,
                        loss_param=None):
     loss_dict = {}
     binary_loss_type = {
-        LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
-        LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
-        LossType.PAIRWISE_LOGISTIC_LOSS, LossType.JRC_LOSS
+      LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
+      LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
+      LossType.PAIRWISE_LOGISTIC_LOSS, LossType.JRC_LOSS
     }
     if loss_type == LossType.CLASSIFICATION:
       loss_name = loss_name if loss_name else 'cross_entropy_loss' + suffix
@@ -184,23 +196,23 @@ def _build_loss_impl(self,
       if hasattr(loss_param, 'session_name'):
         kwargs['session_ids'] = self._feature_dict[loss_param.session_name]
     loss_dict[loss_name] = loss_builder.build(
-        loss_type,
-        self._labels[label_name],
-        pred,
-        loss_weight,
-        num_class,
-        loss_param=loss_param,
-        **kwargs)
+      loss_type,
+      self._labels[label_name],
+      pred,
+      loss_weight,
+      num_class,
+      loss_param=loss_param,
+      **kwargs)
     return loss_dict
 
   def build_loss_graph(self):
     loss_dict = {}
     if len(self._losses) == 0:
       loss_dict = self._build_loss_impl(
-          self._loss_type,
-          label_name=self._label_name,
-          loss_weight=self._sample_weight,
-          num_class=self._num_class)
+        self._loss_type,
+        label_name=self._label_name,
+        loss_weight=self._sample_weight,
+        num_class=self._num_class)
     else:
       strategy = self._base_model_config.loss_weight_strategy
       loss_weight = [1.0]
@@ -212,26 +224,26 @@ def build_loss_graph(self):
         if loss_param is not None:
           loss_param = getattr(loss, loss_param)
         loss_ops = self._build_loss_impl(
-            loss.loss_type,
-            label_name=self._label_name,
-            loss_weight=self._sample_weight,
-            num_class=self._num_class,
-            loss_name=loss.loss_name,
-            loss_param=loss_param)
+          loss.loss_type,
+          label_name=self._label_name,
+          loss_weight=self._sample_weight,
+          num_class=self._num_class,
+          loss_name=loss.loss_name,
+          loss_param=loss_param)
         for loss_name, loss_value in loss_ops.items():
           if strategy == self._base_model_config.Fixed:
             loss_dict[loss_name] = loss_value * loss.weight
           elif strategy == self._base_model_config.Uncertainty:
             if loss.learn_loss_weight:
               uncertainty = tf.Variable(
-                  0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
+                0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
               tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
               if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
                 loss_dict[loss_name] = 0.5 * tf.exp(
-                    -uncertainty) * loss_value + 0.5 * uncertainty
+                  -uncertainty) * loss_value + 0.5 * uncertainty
               else:
                 loss_dict[loss_name] = tf.exp(
-                    -uncertainty) * loss_value + 0.5 * uncertainty
+                  -uncertainty) * loss_value + 0.5 * uncertainty
             else:
               loss_dict[loss_name] = loss_value * loss.weight
           elif strategy == self._base_model_config.Random:
@@ -260,10 +272,10 @@ def _build_metric_impl(self,
     from easy_rec.python.core.easyrec_metrics import metrics_tf
     from easy_rec.python.core import metrics as metrics_lib
     binary_loss_set = {
-        LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
-        LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
-        LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
-        LossType.JRC_LOSS
+      LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
+      LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
+      LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
+      LossType.JRC_LOSS
     }
     metric_dict = {}
     if metric.WhichOneof('metric') == 'auc':
@@ -271,15 +283,15 @@ def _build_metric_impl(self,
       if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['auc' + suffix] = metrics_tf.auc(
-            label,
-            self._prediction_dict['probs' + suffix],
-            num_thresholds=metric.auc.num_thresholds)
+          label,
+          self._prediction_dict['probs' + suffix],
+          num_thresholds=metric.auc.num_thresholds)
       elif num_class == 2:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['auc' + suffix] = metrics_tf.auc(
-            label,
-            self._prediction_dict['probs' + suffix][:, 1],
-            num_thresholds=metric.auc.num_thresholds)
+          label,
+          self._prediction_dict['probs' + suffix][:, 1],
+          num_thresholds=metric.auc.num_thresholds)
       else:
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'gauc':
@@ -289,20 +301,20 @@ def _build_metric_impl(self,
         uids = self._feature_dict[metric.gauc.uid_field]
         if isinstance(uids, tf.sparse.SparseTensor):
           uids = tf.sparse_to_dense(
-              uids.indices, uids.dense_shape, uids.values, default_value='')
+            uids.indices, uids.dense_shape, uids.values, default_value='')
           uids = tf.reshape(uids, [-1])
         metric_dict['gauc' + suffix] = metrics_lib.gauc(
-            label,
-            self._prediction_dict['probs' + suffix],
-            uids=uids,
-            reduction=metric.gauc.reduction)
+          label,
+          self._prediction_dict['probs' + suffix],
+          uids=uids,
+          reduction=metric.gauc.reduction)
       elif num_class == 2:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['gauc' + suffix] = metrics_lib.gauc(
-            label,
-            self._prediction_dict['probs' + suffix][:, 1],
-            uids=self._feature_dict[metric.gauc.uid_field],
-            reduction=metric.gauc.reduction)
+          label,
+          self._prediction_dict['probs' + suffix][:, 1],
+          uids=self._feature_dict[metric.gauc.uid_field],
+          reduction=metric.gauc.reduction)
       else:
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'session_auc':
@@ -310,17 +322,17 @@ def _build_metric_impl(self,
       if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['session_auc' + suffix] = metrics_lib.session_auc(
-            label,
-            self._prediction_dict['probs' + suffix],
-            session_ids=self._feature_dict[metric.session_auc.session_id_field],
-            reduction=metric.session_auc.reduction)
+          label,
+          self._prediction_dict['probs' + suffix],
+          session_ids=self._feature_dict[metric.session_auc.session_id_field],
+          reduction=metric.session_auc.reduction)
       elif num_class == 2:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['session_auc' + suffix] = metrics_lib.session_auc(
-            label,
-            self._prediction_dict['probs' + suffix][:, 1],
-            session_ids=self._feature_dict[metric.session_auc.session_id_field],
-            reduction=metric.session_auc.reduction)
+          label,
+          self._prediction_dict['probs' + suffix][:, 1],
+          session_ids=self._feature_dict[metric.session_auc.session_id_field],
+          reduction=metric.session_auc.reduction)
       else:
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'max_f1':
@@ -328,11 +340,11 @@ def _build_metric_impl(self,
       if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['max_f1' + suffix] = metrics_lib.max_f1(
-            label, self._prediction_dict['logits' + suffix])
+          label, self._prediction_dict['logits' + suffix])
       elif num_class == 2:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['max_f1' + suffix] = metrics_lib.max_f1(
-            label, self._prediction_dict['logits' + suffix][:, 1])
+          label, self._prediction_dict['logits' + suffix][:, 1])
       else:
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'recall_at_topk':
@@ -340,18 +352,18 @@ def _build_metric_impl(self,
       assert num_class > 1
       label = tf.to_int64(self._labels[label_name])
       metric_dict['recall_at_topk' + suffix] = metrics_tf.recall_at_k(
-          label, self._prediction_dict['logits' + suffix],
-          metric.recall_at_topk.topk)
+        label, self._prediction_dict['logits' + suffix],
+        metric.recall_at_topk.topk)
     elif metric.WhichOneof('metric') == 'mean_absolute_error':
       label = tf.to_float(self._labels[label_name])
       if loss_type & {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
         metric_dict['mean_absolute_error' +
                     suffix] = metrics_tf.mean_absolute_error(
-                        label, self._prediction_dict['y' + suffix])
+          label, self._prediction_dict['y' + suffix])
       elif loss_type & {LossType.CLASSIFICATION} and num_class == 1:
         metric_dict['mean_absolute_error' +
                     suffix] = metrics_tf.mean_absolute_error(
-                        label, self._prediction_dict['probs' + suffix])
+          label, self._prediction_dict['probs' + suffix])
       else:
         assert False, 'mean_absolute_error is not supported for this model'
     elif metric.WhichOneof('metric') == 'mean_squared_error':
@@ -359,11 +371,11 @@ def _build_metric_impl(self,
       if loss_type & {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
         metric_dict['mean_squared_error' +
                     suffix] = metrics_tf.mean_squared_error(
-                        label, self._prediction_dict['y' + suffix])
+          label, self._prediction_dict['y' + suffix])
       elif num_class == 1 and loss_type & binary_loss_set:
         metric_dict['mean_squared_error' +
                     suffix] = metrics_tf.mean_squared_error(
-                        label, self._prediction_dict['probs' + suffix])
+          label, self._prediction_dict['probs' + suffix])
       else:
         assert False, 'mean_squared_error is not supported for this model'
     elif metric.WhichOneof('metric') == 'root_mean_squared_error':
@@ -371,11 +383,11 @@ def _build_metric_impl(self,
       if loss_type & {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
         metric_dict['root_mean_squared_error' +
                     suffix] = metrics_tf.root_mean_squared_error(
-                        label, self._prediction_dict['y' + suffix])
+          label, self._prediction_dict['y' + suffix])
       elif loss_type & {LossType.CLASSIFICATION} and num_class == 1:
         metric_dict['root_mean_squared_error' +
                     suffix] = metrics_tf.root_mean_squared_error(
-                        label, self._prediction_dict['probs' + suffix])
+          label, self._prediction_dict['probs' + suffix])
       else:
         assert False, 'root_mean_squared_error is not supported for this model'
     elif metric.WhichOneof('metric') == 'accuracy':
@@ -383,7 +395,7 @@ def _build_metric_impl(self,
       assert num_class > 1
       label = tf.to_int64(self._labels[label_name])
       metric_dict['accuracy' + suffix] = metrics_tf.accuracy(
-          label, self._prediction_dict['y' + suffix])
+        label, self._prediction_dict['y' + suffix])
     return metric_dict
 
   def build_metric_graph(self, eval_config):
@@ -393,18 +405,18 @@ def build_metric_graph(self, eval_config):
       loss_types = {loss.loss_type for loss in self._losses}
     for metric in eval_config.metrics_set:
       metric_dict.update(
-          self._build_metric_impl(
-              metric,
-              loss_type=loss_types,
-              label_name=self._label_name,
-              num_class=self._num_class))
+        self._build_metric_impl(
+          metric,
+          loss_type=loss_types,
+          label_name=self._label_name,
+          num_class=self._num_class))
     return metric_dict
 
   def _get_outputs_impl(self, loss_type, num_class=1, suffix=''):
     binary_loss_set = {
-        LossType.F1_REWEIGHTED_LOSS, LossType.JRC_LOSS, LossType.PAIR_WISE_LOSS,
-        LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
-        LossType.PAIRWISE_LOGISTIC_LOSS
+      LossType.F1_REWEIGHTED_LOSS, LossType.JRC_LOSS, LossType.PAIR_WISE_LOSS,
+      LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
+      LossType.PAIRWISE_LOGISTIC_LOSS
     }
     if loss_type in binary_loss_set:
       return ['probs' + suffix, 'logits' + suffix]
@@ -413,8 +425,8 @@ def _get_outputs_impl(self, loss_type, num_class=1, suffix=''):
         return ['probs' + suffix, 'logits' + suffix]
       else:
         return [
-            'y' + suffix, 'probs' + suffix, 'logits' + suffix,
-            'probs' + suffix + '_y', 'logits' + suffix + '_y'
+          'y' + suffix, 'probs' + suffix, 'logits' + suffix,
+          'probs' + suffix + '_y', 'logits' + suffix + '_y'
         ]
     elif loss_type in [LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS]:
       return ['y' + suffix]
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
index 3dc86cebb..b77be93be 100644
--- a/easy_rec/python/protos/backbone.proto
+++ b/easy_rec/python/protos/backbone.proto
@@ -9,16 +9,46 @@ import "easy_rec/python/protos/masknet.proto";
 
 
 message SequenceLayer {
-    optional DNN mlp = 1;
+    optional MLP mlp = 1;
+}
+
+message Lambda {
+    required string expression = 1;
+}
+
+message Operator {
+    oneof Op {
+        MLP mlp = 102;
+        PeriodicEmbedding periodic_embedding = 103;
+        AutoDisEmbedding auto_dis_embedding = 104;
+        SequenceLayer sequence_encoder = 105;
+        HighWayTower highway = 106;
+        MaskNet masknet = 107;
+        SENet senet = 108;
+        FiBiNetTower fibinet = 109;
+        FM fm = 110;
+        Concatenate concat = 111;
+        Reshape reshape = 112;
+        Add add = 113;
+        Dot dot = 114;
+        Lambda Lambda = 115;
+        OpChain chain = 116;
+    }
+}
+
+message OpChain {
+    repeated Operator ops = 1;
 }
 
 message Block {
     required string name = 1;
     // the input names of feature groups or other blocks
     repeated string inputs = 2;
+    optional int32 input_concat_axis = 3 [default = -1];
+    optional string extra_input_fn = 4;
     oneof layer {
         InputLayer input_layer = 101;
-        DNN mlp = 102;
+        MLP mlp = 102;
         PeriodicEmbedding periodic_embedding = 103;
         AutoDisEmbedding auto_dis_embedding = 104;
         SequenceLayer sequence_encoder = 105;
@@ -27,11 +57,17 @@ message Block {
         SENet senet = 108;
         FiBiNetTower fibinet = 109;
         FM fm = 110;
+        Concatenate concat = 111;
+        Reshape reshape = 112;
+        Add add = 113;
+        Dot dot = 114;
+        Lambda Lambda = 115;
+        OpChain chain = 116;
     }
 }
 
 message BackboneTower {
     repeated Block blocks = 1;
     repeated string concat_blocks = 2;
-    optional DNN top_mlp = 3;
-}
\ No newline at end of file
+    optional MLP top_mlp = 3;
+}
diff --git a/easy_rec/python/protos/dnn.proto b/easy_rec/python/protos/dnn.proto
index 021d34dbb..1564394eb 100644
--- a/easy_rec/python/protos/dnn.proto
+++ b/easy_rec/python/protos/dnn.proto
@@ -12,3 +12,16 @@ message DNN {
     // use batch normalization
     optional bool use_bn = 4 [default = true];
 }
+
+message MLP {
+    // hidden units for each layer
+    repeated uint32 hidden_units = 1;
+    // ratio of dropout
+    repeated float dropout_ratio = 2;
+    // activation function
+    optional string activation = 3 [default = 'tf.nn.relu'];
+    // use batch normalization
+    optional bool use_bn = 4 [default = true];
+    optional bool last_layer_no_activation = 5 [default = false];
+    optional bool last_layer_no_batch_norm = 6 [default = false];
+}
\ No newline at end of file
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 3f4f851b9..940ee88f3 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -31,7 +31,8 @@ message DummyModel {
 }
 // configure backbone network in a free style way
 message RankModel {
-    optional float l2_regularization = 1;
+  optional float l2_regularization = 1;
+  optional bool add_head_logits_layer = 2 [default=true];
 }
 
 // for knowledge distillation
@@ -49,7 +50,6 @@ message KD {
   optional float loss_weight = 4 [default=1.0];
   // only for loss_type == CROSS_ENTROPY_LOSS
   optional float temperature = 5 [default=1.0];
-
 }
 
 message EasyRecModel {
diff --git a/easy_rec/python/protos/fm.proto b/easy_rec/python/protos/fm.proto
index c90af8cab..31d8f27d7 100644
--- a/easy_rec/python/protos/fm.proto
+++ b/easy_rec/python/protos/fm.proto
@@ -2,5 +2,6 @@ syntax = "proto2";
 package protos;
 
 message FM {
+    optional bool use_variant = 1;
     optional float l2_regularization = 5 [default = 1e-4];
 }
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index 5c7bb81a1..576bfdf4f 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -9,6 +9,7 @@ message InputLayer {
     optional float dropout_rate = 3;
     optional float feature_dropout_rate = 4;
     optional bool output_feature_list = 5;
+    optional bool output_3d_tensor = 6;
 }
 
 message HighWayTower {
@@ -20,7 +21,10 @@ message HighWayTower {
 
 message PeriodicEmbedding {
     required uint32 embedding_dim = 1;
-    required float  coef_stddev = 2 [default = 1.0];
+    required float  sigma = 2;
+    optional bool add_linear_layer = 3 [default = true];
+    optional string linear_activation = 4 [default = 'relu'];
+    optional bool output_3d_tensor = 5;
 }
 
 message AutoDisEmbedding {
@@ -28,4 +32,21 @@ message AutoDisEmbedding {
     required uint32 num_bins = 2;
     required float keep_prob = 3 [default = 0.8];
     required float temperature = 4;
+    optional bool output_3d_tensor = 5;
+}
+
+message Concatenate {
+    required int32 axis = 1;
+    optional int32 expand_dim_before = 2;
+    optional int32 expand_dim_after = 3;
+}
+
+message Reshape {
+    repeated int32 dims = 1;
+}
+
+message Add {
+}
+
+message Dot {
 }
\ No newline at end of file
diff --git a/easy_rec/python/protos/seq_encoder.proto b/easy_rec/python/protos/seq_encoder.proto
index 7a608af18..f02490238 100644
--- a/easy_rec/python/protos/seq_encoder.proto
+++ b/easy_rec/python/protos/seq_encoder.proto
@@ -50,4 +50,3 @@ message DINEncoder {
     // option: softmax, sigmoid
     required string attention_normalizer = 3 [default = 'softmax'];
 }
-
diff --git a/easy_rec/python/train_eval.py b/easy_rec/python/train_eval.py
index bdb65eb0a..51c904451 100644
--- a/easy_rec/python/train_eval.py
+++ b/easy_rec/python/train_eval.py
@@ -95,8 +95,13 @@
       help='is use check mode')
   parser.add_argument(
       '--selected_cols', type=str, default=None, help='select input columns')
+  parser.add_argument(
+    '--gpu', type=str, default=None, help='gpu id')
   args, extra_args = parser.parse_known_args()
 
+  if args.gpu is not None:
+    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+
   edit_config_json = {}
   if args.edit_config_json:
     edit_config_json = json.loads(args.edit_config_json)
diff --git a/easy_rec/python/utils/__init__.py b/easy_rec/python/utils/__init__.py
index e69de29bb..8a9b460ac 100644
--- a/easy_rec/python/utils/__init__.py
+++ b/easy_rec/python/utils/__init__.py
@@ -0,0 +1,17 @@
+
+class conditional(object):
+    """Wrap another context manager and enter it only if condition is true.
+    """
+
+    def __init__(self, condition, contextmanager):
+        self.condition = condition
+        self.contextmanager = contextmanager
+
+    def __enter__(self):
+        """Conditionally enter a context manager."""
+        if self.condition:
+            return self.contextmanager.__enter__()
+
+    def __exit__(self, *args):
+        if self.condition:
+            return self.contextmanager.__exit__(*args)
diff --git a/easy_rec/python/utils/dag.py b/easy_rec/python/utils/dag.py
index 5063c8473..00646f732 100644
--- a/easy_rec/python/utils/dag.py
+++ b/easy_rec/python/utils/dag.py
@@ -1,205 +1,205 @@
-from collections import OrderedDict, defaultdict
-from copy import copy, deepcopy
+from collections import OrderedDict
+from collections import defaultdict
+from copy import copy
+from copy import deepcopy
 
 
 class DAG(object):
-    """ Directed acyclic graph implementation. """
-
-    def __init__(self):
-        """ Construct a new DAG with no nodes or edges. """
-        self.reset_graph()
-
-    def add_node(self, node_name, graph=None):
-        """ Add a node if it does not exist yet, or error out. """
-        if not graph:
-            graph = self.graph
-        if node_name in graph:
-            raise KeyError('node %s already exists' % node_name)
-        graph[node_name] = set()
-
-    def add_node_if_not_exists(self, node_name, graph=None):
-        try:
-            self.add_node(node_name, graph=graph)
-        except KeyError:
-            pass
-
-    def delete_node(self, node_name, graph=None):
-        """ Deletes this node and all edges referencing it. """
-        if not graph:
-            graph = self.graph
-        if node_name not in graph:
-            raise KeyError('node %s does not exist' % node_name)
-        graph.pop(node_name)
-
-        for node, edges in graph.items():
-            if node_name in edges:
-                edges.remove(node_name)
-
-    def delete_node_if_exists(self, node_name, graph=None):
-        try:
-            self.delete_node(node_name, graph=graph)
-        except KeyError:
-            pass
-
-    def add_edge(self, ind_node, dep_node, graph=None):
-        """ Add an edge (dependency) between the specified nodes. """
-        if not graph:
-            graph = self.graph
-        if ind_node not in graph or dep_node not in graph:
-            raise KeyError('one or more nodes do not exist in graph')
-        test_graph = deepcopy(graph)
-        test_graph[ind_node].add(dep_node)
-        is_valid, message = self.validate(test_graph)
-        if is_valid:
-            graph[ind_node].add(dep_node)
-        else:
-            raise Exception()
-
-    def delete_edge(self, ind_node, dep_node, graph=None):
-        """ Delete an edge from the graph. """
-        if not graph:
-            graph = self.graph
-        if dep_node not in graph.get(ind_node, []):
-            raise KeyError('this edge does not exist in graph')
-        graph[ind_node].remove(dep_node)
-
-    def rename_edges(self, old_task_name, new_task_name, graph=None):
-        """ Change references to a task in existing edges. """
-        if not graph:
-            graph = self.graph
-        for node, edges in graph.items():
-
-            if node == old_task_name:
-                graph[new_task_name] = copy(edges)
-                del graph[old_task_name]
-
-            else:
-                if old_task_name in edges:
-                    edges.remove(old_task_name)
-                    edges.add(new_task_name)
-
-    def predecessors(self, node, graph=None):
-        """ Returns a list of all predecessors of the given node """
-        if graph is None:
-            graph = self.graph
-        return [key for key in graph if node in graph[key]]
-
-    def downstream(self, node, graph=None):
-        """ Returns a list of all nodes this node has edges towards. """
-        if graph is None:
-            graph = self.graph
-        if node not in graph:
-            raise KeyError('node %s is not in graph' % node)
-        return list(graph[node])
-
-    def all_downstreams(self, node, graph=None):
-        """Returns a list of all nodes ultimately downstream
-        of the given node in the dependency graph, in
-        topological order."""
-        if graph is None:
-            graph = self.graph
-        nodes = [node]
-        nodes_seen = set()
-        i = 0
-        while i < len(nodes):
-            downstreams = self.downstream(nodes[i], graph)
-            for downstream_node in downstreams:
-                if downstream_node not in nodes_seen:
-                    nodes_seen.add(downstream_node)
-                    nodes.append(downstream_node)
-            i += 1
-        return list(
-            filter(
-                lambda node: node in nodes_seen,
-                self.topological_sort(graph=graph)
-            )
-        )
-
-    def all_leaves(self, graph=None):
-        """ Return a list of all leaves (nodes with no downstreams) """
-        if graph is None:
-            graph = self.graph
-        return [key for key in graph if not graph[key]]
-
-    def from_dict(self, graph_dict):
-        """ Reset the graph and build it from the passed dictionary.
-        The dictionary takes the form of {node_name: [directed edges]}
-        """
-
-        self.reset_graph()
-        for new_node in graph_dict.keys():
-            self.add_node(new_node)
-        for ind_node, dep_nodes in graph_dict.items():
-            if not isinstance(dep_nodes, list):
-                raise TypeError('dict values must be lists')
-            for dep_node in dep_nodes:
-                self.add_edge(ind_node, dep_node)
-
-    def reset_graph(self):
-        """ Restore the graph to an empty state. """
-        self.graph = OrderedDict()
-
-    def ind_nodes(self, graph=None):
-        """ Returns a list of all nodes in the graph with no dependencies. """
-        if graph is None:
-            graph = self.graph
-
-        dependent_nodes = set(
-            node for dependents in graph.values() for node in dependents
-        )
-        return [node for node in graph.keys() if node not in dependent_nodes]
-
-    def validate(self, graph=None):
-        """ Returns (Boolean, message) of whether DAG is valid. """
-        graph = graph if graph is not None else self.graph
-        if len(self.ind_nodes(graph)) == 0:
-            return False, 'no independent nodes detected'
-        try:
-            self.topological_sort(graph)
-        except ValueError:
-            return False, 'failed topological sort'
-        return True, 'valid'
-
-    def topological_sort(self, graph=None):
-        """ Returns a topological ordering of the DAG.
-        Raises an error if this is not possible (graph is not valid).
-        """
-        if graph is None:
-            graph = self.graph
-        result = []
-        in_degree = defaultdict(lambda: 0)
-
-        for u in graph:
-            for v in graph[u]:
-                in_degree[v] += 1
-        ready = [node for node in graph if not in_degree[node]]
-
-        while ready:
-            u = ready.pop()
-            result.append(u)
-            for v in graph[u]:
-                in_degree[v] -= 1
-                if in_degree[v] == 0:
-                    ready.append(v)
-
-        if len(result) == len(graph):
-            return result
-        else:
-            raise ValueError('graph is not acyclic')
-
-    def size(self):
-        return len(self.graph)
+  """Directed acyclic graph implementation."""
+
+  def __init__(self):
+    """Construct a new DAG with no nodes or edges."""
+    self.reset_graph()
+
+  def add_node(self, node_name, graph=None):
+    """Add a node if it does not exist yet, or error out."""
+    if not graph:
+      graph = self.graph
+    if node_name in graph:
+      raise KeyError('node %s already exists' % node_name)
+    graph[node_name] = set()
+
+  def add_node_if_not_exists(self, node_name, graph=None):
+    try:
+      self.add_node(node_name, graph=graph)
+    except KeyError:
+      pass
+
+  def delete_node(self, node_name, graph=None):
+    """Deletes this node and all edges referencing it."""
+    if not graph:
+      graph = self.graph
+    if node_name not in graph:
+      raise KeyError('node %s does not exist' % node_name)
+    graph.pop(node_name)
+
+    for node, edges in graph.items():
+      if node_name in edges:
+        edges.remove(node_name)
+
+  def delete_node_if_exists(self, node_name, graph=None):
+    try:
+      self.delete_node(node_name, graph=graph)
+    except KeyError:
+      pass
+
+  def add_edge(self, ind_node, dep_node, graph=None):
+    """Add an edge (dependency) between the specified nodes."""
+    if not graph:
+      graph = self.graph
+    if ind_node not in graph or dep_node not in graph:
+      raise KeyError('one or more nodes do not exist in graph')
+    test_graph = deepcopy(graph)
+    test_graph[ind_node].add(dep_node)
+    is_valid, message = self.validate(test_graph)
+    if is_valid:
+      graph[ind_node].add(dep_node)
+    else:
+      raise Exception()
+
+  def delete_edge(self, ind_node, dep_node, graph=None):
+    """Delete an edge from the graph."""
+    if not graph:
+      graph = self.graph
+    if dep_node not in graph.get(ind_node, []):
+      raise KeyError('this edge does not exist in graph')
+    graph[ind_node].remove(dep_node)
+
+  def rename_edges(self, old_task_name, new_task_name, graph=None):
+    """Change references to a task in existing edges."""
+    if not graph:
+      graph = self.graph
+    for node, edges in graph.items():
+
+      if node == old_task_name:
+        graph[new_task_name] = copy(edges)
+        del graph[old_task_name]
+
+      else:
+        if old_task_name in edges:
+          edges.remove(old_task_name)
+          edges.add(new_task_name)
+
+  def predecessors(self, node, graph=None):
+    """Returns a list of all predecessors of the given node."""
+    if graph is None:
+      graph = self.graph
+    return [key for key in graph if node in graph[key]]
+
+  def downstream(self, node, graph=None):
+    """Returns a list of all nodes this node has edges towards."""
+    if graph is None:
+      graph = self.graph
+    if node not in graph:
+      raise KeyError('node %s is not in graph' % node)
+    return list(graph[node])
+
+  def all_downstreams(self, node, graph=None):
+    """Returns a list of all nodes ultimately downstream of the given node in the dependency graph.
+
+    in topological order.
+    """
+    if graph is None:
+      graph = self.graph
+    nodes = [node]
+    nodes_seen = set()
+    i = 0
+    while i < len(nodes):
+      downstreams = self.downstream(nodes[i], graph)
+      for downstream_node in downstreams:
+        if downstream_node not in nodes_seen:
+          nodes_seen.add(downstream_node)
+          nodes.append(downstream_node)
+      i += 1
+    return list(
+        filter(lambda node: node in nodes_seen,
+               self.topological_sort(graph=graph)))
+
+  def all_leaves(self, graph=None):
+    """Return a list of all leaves (nodes with no downstreams)."""
+    if graph is None:
+      graph = self.graph
+    return [key for key in graph if not graph[key]]
+
+  def from_dict(self, graph_dict):
+    """Reset the graph and build it from the passed dictionary.
+
+    The dictionary takes the form of {node_name: [directed edges]}
+    """
+    self.reset_graph()
+    for new_node in graph_dict.keys():
+      self.add_node(new_node)
+    for ind_node, dep_nodes in graph_dict.items():
+      if not isinstance(dep_nodes, list):
+        raise TypeError('dict values must be lists')
+      for dep_node in dep_nodes:
+        self.add_edge(ind_node, dep_node)
+
+  def reset_graph(self):
+    """Restore the graph to an empty state."""
+    self.graph = OrderedDict()
+
+  def ind_nodes(self, graph=None):
+    """Returns a list of all nodes in the graph with no dependencies."""
+    if graph is None:
+      graph = self.graph
+
+    dependent_nodes = set(
+        node for dependents in graph.values() for node in dependents)
+    return [node for node in graph.keys() if node not in dependent_nodes]
+
+  def validate(self, graph=None):
+    """Returns (Boolean, message) of whether DAG is valid."""
+    graph = graph if graph is not None else self.graph
+    if len(self.ind_nodes(graph)) == 0:
+      return False, 'no independent nodes detected'
+    try:
+      self.topological_sort(graph)
+    except ValueError:
+      return False, 'failed topological sort'
+    return True, 'valid'
+
+  def topological_sort(self, graph=None):
+    """Returns a topological ordering of the DAG.
+
+    Raises an error if this is not possible (graph is not valid).
+    """
+    if graph is None:
+      graph = self.graph
+    result = []
+    in_degree = defaultdict(lambda: 0)
+
+    for u in graph:
+      for v in graph[u]:
+        in_degree[v] += 1
+    ready = [node for node in graph if not in_degree[node]]
+
+    while ready:
+      u = ready.pop()
+      result.append(u)
+      for v in graph[u]:
+        in_degree[v] -= 1
+        if in_degree[v] == 0:
+          ready.append(v)
+
+    if len(result) == len(graph):
+      return result
+    else:
+      raise ValueError('graph is not acyclic')
+
+  def size(self):
+    return len(self.graph)
 
 
 if __name__ == '__main__':
-    dag = DAG()
-    dag.add_node("a")
-    dag.add_node("b")
-    dag.add_node("c")
-    dag.add_node("d")
-    dag.add_edge("a", "b")
-    dag.add_edge("a", "d")
-    dag.add_edge("b", "c")
-    print(dag.topological_sort())
-    print(dag.graph)
-    print(dag.all_downstreams("b"))
\ No newline at end of file
+  dag = DAG()
+  dag.add_node('a')
+  dag.add_node('b')
+  dag.add_node('c')
+  dag.add_node('d')
+  dag.add_edge('a', 'b')
+  dag.add_edge('a', 'd')
+  dag.add_edge('b', 'c')
+  print(dag.topological_sort())
+  print(dag.graph)
+  print(dag.all_downstreams('b'))
diff --git a/easy_rec/python/utils/tf_utils.py b/easy_rec/python/utils/tf_utils.py
index e1026c132..efcd7df12 100644
--- a/easy_rec/python/utils/tf_utils.py
+++ b/easy_rec/python/utils/tf_utils.py
@@ -46,3 +46,39 @@ def get_config_type(tf_type):
   }
   assert tf_type in type_map, 'invalid type: %s' % tf_type
   return type_map[tf_type]
+
+
+def add_op(inputs):
+  if not isinstance(inputs, list):
+    return inputs
+  if len(inputs) == 1:
+    if isinstance(inputs[0], list):
+      return tf.keras.layers.Add()(inputs[0])
+    return inputs[0]
+  return tf.keras.layers.Add()(inputs)
+
+
+def dot_op(features):
+  """Compute inner dot between any two pair tensors.
+
+  Args:
+    features:
+    - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
+    - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
+  Return:
+    - 2D tensor with shape: ``(batch_size, 1)``.
+  """
+  if isinstance(features, (list, tuple)):
+    features = tf.stack(features, axis=1)
+  assert features.shape.ndims == 3, 'input of dot func must be a 3D tensor or a list of 2D tensors'
+
+  batch_size = tf.shape(features)[0]
+  matrixdot = tf.matmul(features, features, transpose_b=True)
+  feature_dim = matrixdot.shape[-1]
+
+  ones_mat = tf.ones_like(matrixdot)
+  lower_tri_mat = ones_mat - tf.linalg.band_part(ones_mat, 0, -1)
+  lower_tri_mask = tf.cast(lower_tri_mat, tf.bool)
+  result = tf.boolean_mask(matrixdot, lower_tri_mask)
+  output_dim = feature_dim * (feature_dim - 1) // 2
+  return tf.reshape(result, (batch_size, output_dim))
diff --git a/examples/configs/deepfm_backbone_on_criteo.config b/examples/configs/deepfm_backbone_on_criteo.config
index a0982a16e..c94838daf 100644
--- a/examples/configs/deepfm_backbone_on_criteo.config
+++ b/examples/configs/deepfm_backbone_on_criteo.config
@@ -1,25 +1,21 @@
 train_input_path: "examples/data/criteo/criteo_train_data"
 eval_input_path: "examples/data/criteo/criteo_test_data"
-model_dir: "examples/ckpt/deepfm_criteo_ckpt"
+model_dir: "examples/ckpt/deepfm_backbone_criteo"
 
 train_config {
   log_step_count_steps: 500
   optimizer_config: {
     adam_optimizer: {
       learning_rate: {
-        exponential_decay_learning_rate {
-          initial_learning_rate: 0.001
-          decay_steps: 1000
-          decay_factor: 0.5
-          min_learning_rate: 0.00001
+        constant_learning_rate {
+          learning_rate: 0.001
         }
       }
     }
     use_moving_average: false
   }
-  save_checkpoints_steps: 1000
+  save_checkpoints_steps: 20000
   sync_replicas: True
-  num_steps: 20000
 }
 
 eval_config {
@@ -241,110 +237,110 @@ data_config {
 feature_config: {
   features: {
     input_names: "F1"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val:0.0
     max_val: 5775.0
   }
   features: {
     input_names: "F2"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: -3.0
     max_val: 257675.0
   }
   features: {
     input_names: "F3"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 65535.0
   }
   features: {
     input_names: "F4"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 969.0
   }
   features: {
     input_names: "F5"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 23159456.0
   }
   features: {
     input_names: "F6"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 431037.0
   }
   features: {
     input_names: "F7"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 56311.0
   }
   features: {
     input_names: "F8"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 6047.0
   }
   features: {
     input_names: "F9"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 29019.0
   }
   features: {
     input_names: "F10"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 46.0
   }
   features: {
     input_names: "F11"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 231.0
   }
   features: {
     input_names: "F12"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 4008.0
   }
   features: {
     input_names: "F13"
-    embedding_dim:16
+    embedding_dim: 16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 7393.0
   }
   features: {
     input_names: "C1"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 2000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C2"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 1000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C3"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 2500000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -356,132 +352,132 @@ feature_config: {
   }
   features: {
     input_names: "C5"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 500
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C6"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 50
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C7"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 13000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C8"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 1000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C9"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 10
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C10"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 100000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C11"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 6000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C12"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 2000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C13"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 4000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C14"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 100
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C15"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 20000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C16"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 1250000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C17"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 50
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C18"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 6000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C19"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 3000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C20"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 10
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C21"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 1250000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C22"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 50
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C23"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 50
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C24"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 280000
     feature_type: IdFeature
     embedding_dim: 16
   }features: {
     input_names: "C25"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 200
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C26"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 150000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -542,7 +538,9 @@ model_config: {
     blocks {
       name: 'fm'
       inputs: 'emb_list'
-      fm {}
+      fm {
+        use_variant: true
+      }
     }
     blocks {
       name: 'deep'
@@ -552,6 +550,9 @@ model_config: {
       }
     }
     concat_blocks: ['fm', 'deep']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
   }
   rank_model {
     l2_regularization: 1e-5
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
index 1dcdf7512..04dde5589 100644
--- a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
+++ b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
@@ -1,25 +1,21 @@
 train_input_path: "examples/data/criteo/criteo_train_data"
 eval_input_path: "examples/data/criteo/criteo_test_data"
-model_dir: "examples/ckpt/deepfm_autodis_criteo_ckpt"
+model_dir: "examples/ckpt/deepfm_autodis_criteo"
 
 train_config {
   log_step_count_steps: 500
   optimizer_config: {
     adam_optimizer: {
       learning_rate: {
-        exponential_decay_learning_rate {
-          initial_learning_rate: 0.001
-          decay_steps: 1000
-          decay_factor: 0.5
-          min_learning_rate: 0.00001
+        constant_learning_rate {
+          learning_rate: 0.001
         }
       }
     }
     use_moving_average: false
   }
-  save_checkpoints_steps: 1000
+  save_checkpoints_steps: 20000
   sync_replicas: True
-  num_steps: 20000
 }
 
 eval_config {
@@ -241,110 +237,97 @@ data_config {
 feature_config: {
   features: {
     input_names: "F1"
-    embedding_dim:16
     feature_type: RawFeature
     min_val:0.0
     max_val: 5775.0
   }
   features: {
     input_names: "F2"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: -3.0
     max_val: 257675.0
   }
   features: {
     input_names: "F3"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 65535.0
   }
   features: {
     input_names: "F4"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 969.0
   }
   features: {
     input_names: "F5"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 23159456.0
   }
   features: {
     input_names: "F6"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 431037.0
   }
   features: {
     input_names: "F7"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 56311.0
   }
   features: {
     input_names: "F8"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 6047.0
   }
   features: {
     input_names: "F9"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 29019.0
   }
   features: {
     input_names: "F10"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 46.0
   }
   features: {
     input_names: "F11"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 231.0
   }
   features: {
     input_names: "F12"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 4008.0
   }
   features: {
     input_names: "F13"
-    embedding_dim:16
     feature_type: RawFeature
     min_val: 0.0
     max_val: 7393.0
   }
   features: {
     input_names: "C1"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 2000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C2"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 1000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C3"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 2500000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -356,132 +339,132 @@ feature_config: {
   }
   features: {
     input_names: "C5"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 500
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C6"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 50
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C7"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 13000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C8"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 1000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C9"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 10
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C10"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 100000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C11"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 6000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C12"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 2000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C13"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 4000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C14"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 100
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C15"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 20000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C16"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 1250000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C17"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 50
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C18"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 6000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C19"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 3000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C20"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 10
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C21"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 1250000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C22"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 50
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C23"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 50
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C24"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 280000
     feature_type: IdFeature
     embedding_dim: 16
   }features: {
     input_names: "C25"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 200
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C26"
-    hash_bucket_size: 1000000
+    hash_bucket_size: 150000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -489,7 +472,7 @@ feature_config: {
 model_config: {
   model_class: 'RankModel'
   feature_groups: {
-    group_name: "features"
+    group_name: "numerical_features"
     feature_names: "F1"
     feature_names: "F2"
     feature_names: "F3"
@@ -503,6 +486,10 @@ model_config: {
     feature_names: "F11"
     feature_names: "F12"
     feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "categorical_features"
     feature_names: "C1"
     feature_names: "C2"
     feature_names: "C3"
@@ -533,25 +520,51 @@ model_config: {
   }
   backbone {
     blocks {
-      name: 'emb_list'
-      inputs: 'features'
+      name: 'cat_emb'
+      inputs: 'categorical_features'
       input_layer {
-        output_feature_list: true
+        output_3d_tensor: true
+      }
+    }
+    blocks {
+      name: 'num_emb'
+      inputs: 'numerical_features'
+      auto_dis_embedding {
+        embedding_dim: 16
+        num_bins: 20
+        temperature: 0.815
+        output_3d_tensor: true
       }
     }
     blocks {
       name: 'fm'
-      inputs: 'emb_list'
-      fm {}
+      inputs: 'cat_emb'
+      inputs: 'num_emb'
+      input_concat_axis: 1
+      fm {
+        use_variant: true
+      }
+    }
+    blocks {
+      name: 'cat_and_num'
+      inputs: 'cat_emb'
+      inputs: 'num_emb'
+      input_concat_axis: 1
+      reshape {
+        dims: [-1, 624]
+      }
     }
     blocks {
       name: 'deep'
-      inputs: 'features'
+      inputs: 'cat_and_num'
       mlp {
         hidden_units: [256, 128, 64]
       }
     }
     concat_blocks: ['fm', 'deep']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
   }
   rank_model {
     l2_regularization: 1e-5
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_periodic.config b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
new file mode 100644
index 000000000..2affcc9ae
--- /dev/null
+++ b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
@@ -0,0 +1,571 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/deepfm_periodic_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        constant_learning_rate {
+          learning_rate: 0.001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 2000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 2500000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 500
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 50
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 13000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 10
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 100000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 6000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 2000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 4000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 100
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 20000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1250000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 50
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 6000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 3000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 10
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1250000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 50
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 50
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 280000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 200
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 150000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "numerical_features"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "categorical_features"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'cat_emb'
+      inputs: 'categorical_features'
+      input_layer {
+        output_3d_tensor: true
+      }
+    }
+    blocks {
+      name: 'num_emb'
+      inputs: 'numerical_features'
+      periodic_embedding {
+        embedding_dim: 16
+        output_3d_tensor: true
+      }
+    }
+    blocks {
+      name: 'fm'
+      inputs: 'cat_emb'
+      inputs: 'num_emb'
+      input_concat_axis: 1
+      fm {
+        use_variant: true
+      }
+    }
+    blocks {
+      name: 'cat_and_num'
+      inputs: 'cat_emb'
+      inputs: 'num_emb'
+      input_concat_axis: 1
+      reshape {
+        dims: [-1, 624]
+      }
+    }
+    blocks {
+      name: 'deep'
+      inputs: 'cat_and_num'
+      mlp {
+        hidden_units: [256, 128, 64]
+      }
+    }
+    concat_blocks: ['fm', 'deep']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/dlrm_backbone_on_criteo.config b/examples/configs/dlrm_backbone_on_criteo.config
new file mode 100644
index 000000000..7d698e858
--- /dev/null
+++ b/examples/configs/dlrm_backbone_on_criteo.config
@@ -0,0 +1,566 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_backbone_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        constant_learning_rate {
+          learning_rate: 0.001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 2000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 2500000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 500
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 50
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 13000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 10
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 100000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 6000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 2000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 4000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 100
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 20000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1250000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 50
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 6000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 3000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 10
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1250000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 50
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 50
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 280000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 200
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 150000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'bottom_mlp'
+      inputs: 'dense'
+      mlp {
+        hidden_units: [64, 32, 16]
+      }
+    }
+    blocks {
+      name: 'bottom_list'
+      inputs: 'bottom_mlp'
+      Lambda {
+        expression: 'lambda x: [x]'
+      }
+    }
+    blocks {
+      name: 'sparse_features'
+      inputs: 'sparse'
+      input_layer {
+        output_feature_list: true
+      }
+    }
+    blocks {
+      name: 'dot'
+      inputs: 'bottom_list'
+      inputs: 'sparse_features'
+      dot { }
+    }
+    blocks {
+      name: 'dot_and_dense'
+      inputs: 'bottom_mlp'
+      inputs: 'dot'
+      concat {
+        axis: 1
+      }
+    }
+    concat_blocks: ['dot_and_dense']
+    top_mlp {
+      hidden_units: [128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/readme.md b/examples/readme.md
index b95adc8b1..286b292b1 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -73,6 +73,8 @@ EasyRec的模型训练和评估都是基于config配置文件的，配置文件
 
 - [deepfm_on_movielens.config](configs/deepfm_on_movielens.config)
 
+- [deepfm_backbone_on_movielens.config](configs/deepfm_backbone_on_movielens.config)
+
 - [dcn_on_movielens.config](configs/dcn_on_movielens.config)
 
 - [autoint_on_movielens.config](configs/autoint_on_movielens.config)
@@ -85,6 +87,8 @@ EasyRec的模型训练和评估都是基于config配置文件的，配置文件
 
 - [deepfm_on_criteo.config](configs/deepfm_on_criteo.config)
 
+- [deepfm_backbone_on_criteo.config](configs/deepfm_backbone_on_criteo.config)
+
 **召回任务**
 
 - [dssm_on_books.config](configs/dssm_on_books.config)
@@ -209,6 +213,7 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
   | --------- | ----- | ------ |
   | Wide&Deep | 1     | 0.8558 |
   | DeepFM    | 1     | 0.8688 |
+  | DeepFM(Backbone)|1| 0.8876 |
   | DCN       | 1     | 0.8576 |
   | AutoInt   | 1     | 0.8513 |
   | MaskNet   | 1     | 0.8872 |
@@ -220,6 +225,9 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
   | ------ | ----- | ------ |
   | FM     | 1     | 0.7577 |
   | DeepFM | 1     | 0.7967 |
+  | DeepFM(backbone)| 1 | 0.7965 |
+  | DeepFM(periodic)| 1 | 0.7982 |
+  | DeepFM(autodis) | 1 | 0.7983 |
 
 ### 召回模型
 

From 96d502e44cd53cdc92946ffb11069e41d13f835b Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 16 Jun 2023 08:58:50 +0800
Subject: [PATCH 33/54] [feat]: add more backbone blocks

---
 .../compat/feature_column/feature_column.py   | 859 +++++++++---------
 easy_rec/python/layers/backbone.py            | 222 +++--
 easy_rec/python/layers/common_layers.py       |  68 +-
 easy_rec/python/layers/fm.py                  |   5 +-
 easy_rec/python/layers/input_layer.py         |   7 +-
 easy_rec/python/layers/keras/__init__.py      |   1 +
 easy_rec/python/layers/keras/dcn.py           | 182 ++++
 .../python/layers/keras/dot_interaction.py    |  92 ++
 easy_rec/python/layers/numerical_embedding.py |  46 +-
 easy_rec/python/model/easy_rec_model.py       |   9 +-
 easy_rec/python/model/rank_model.py           | 190 ++--
 easy_rec/python/protos/backbone.proto         |  59 +-
 easy_rec/python/protos/dnn.proto              |   2 +-
 easy_rec/python/protos/easy_rec_model.proto   |   1 +
 easy_rec/python/protos/layer.proto            |   9 +-
 easy_rec/python/train_eval.py                 |   5 +-
 easy_rec/python/utils/__init__.py             |  24 +-
 easy_rec/python/utils/load_class.py           |  27 +
 easy_rec/python/utils/tf_utils.py             |   2 +-
 .../configs/deepfm_backbone_on_criteo.config  | 136 ++-
 ...pfm_backbone_on_criteo_with_autodis.config | 259 +++++-
 ...fm_backbone_on_criteo_with_periodic.config | 259 +++++-
 .../configs/dlrm_backbone_on_criteo.config    |  97 +-
 examples/configs/dlrm_on_criteo.config        | 534 +++++++++++
 .../dlrm_on_criteo_with_autodis.config        | 578 ++++++++++++
 .../configs/dlrm_standard_on_criteo.config    | 560 ++++++++++++
 examples/data/criteo/process_criteo_kaggle.py |   6 +
 examples/readme.md                            |  36 +-
 28 files changed, 3363 insertions(+), 912 deletions(-)
 create mode 100644 easy_rec/python/layers/keras/__init__.py
 create mode 100644 easy_rec/python/layers/keras/dcn.py
 create mode 100644 easy_rec/python/layers/keras/dot_interaction.py
 create mode 100644 examples/configs/dlrm_on_criteo.config
 create mode 100644 examples/configs/dlrm_on_criteo_with_autodis.config
 create mode 100644 examples/configs/dlrm_standard_on_criteo.config

diff --git a/easy_rec/python/compat/feature_column/feature_column.py b/easy_rec/python/compat/feature_column/feature_column.py
index d0f23dfbb..27557e9a7 100644
--- a/easy_rec/python/compat/feature_column/feature_column.py
+++ b/easy_rec/python/compat/feature_column/feature_column.py
@@ -167,7 +167,6 @@
 
 from easy_rec.python.compat import embedding_ops as ev_embedding_ops
 from easy_rec.python.compat.feature_column import utils as fc_utils
-from easy_rec.python.layers.common_layers import layer_norm
 
 
 def _internal_input_layer(features,
@@ -185,9 +184,9 @@ def _internal_input_layer(features,
   for column in feature_columns:
     if not isinstance(column, _DenseColumn):
       raise ValueError(
-        'Items of feature_columns must be a _DenseColumn. '
-        'You can wrap a categorical column with an '
-        'embedding_column or indicator_column. Given: {}'.format(column))
+          'Items of feature_columns must be a _DenseColumn. '
+          'You can wrap a categorical column with an '
+          'embedding_column or indicator_column. Given: {}'.format(column))
   weight_collections = list(weight_collections or [])
   if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
     weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
@@ -205,20 +204,20 @@ def _get_logits():  # pylint: disable=missing-docstring
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name):  # pylint: disable=protected-access
         tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-          builder,
-          weight_collections=weight_collections,
-          trainable=trainable)
+            builder,
+            weight_collections=weight_collections,
+            trainable=trainable)
         num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
         batch_size = array_ops.shape(tensor)[0]
         output_tensor = array_ops.reshape(
-          tensor, shape=(batch_size, num_elements))
+            tensor, shape=(batch_size, num_elements))
         output_tensors.append(output_tensor)
         if cols_to_vars is not None:
           # Retrieve any variables created (some _DenseColumn's don't create
           # variables, in which case an empty list is returned).
           cols_to_vars[column] = ops.get_collection(
-            ops.GraphKeys.GLOBAL_VARIABLES,
-            scope=variable_scope.get_variable_scope().name)
+              ops.GraphKeys.GLOBAL_VARIABLES,
+              scope=variable_scope.get_variable_scope().name)
         if cols_to_output_tensors is not None:
           cols_to_output_tensors[column] = output_tensor
         if feature_name_to_output_tensors is not None:
@@ -303,14 +302,14 @@ def input_layer(features,
     ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
   """
   return _internal_input_layer(
-    features,
-    feature_columns,
-    weight_collections=weight_collections,
-    trainable=trainable,
-    cols_to_vars=cols_to_vars,
-    cols_to_output_tensors=cols_to_output_tensors,
-    feature_name_to_output_tensors=feature_name_to_output_tensors,
-    sort_feature_columns_by_name=sort_feature_columns_by_name)
+      features,
+      feature_columns,
+      weight_collections=weight_collections,
+      trainable=trainable,
+      cols_to_vars=cols_to_vars,
+      cols_to_output_tensors=cols_to_output_tensors,
+      feature_name_to_output_tensors=feature_name_to_output_tensors,
+      sort_feature_columns_by_name=sort_feature_columns_by_name)
 
 
 # TODO(akshayka): InputLayer should be a subclass of Layer, and it
@@ -334,17 +333,17 @@ def __init__(self,
     self._cols_to_vars = cols_to_vars
     self._name = name
     self._input_layer_template = template.make_template(
-      self._name, _internal_input_layer, create_scope_now_=create_scope_now)
+        self._name, _internal_input_layer, create_scope_now_=create_scope_now)
     self._scope = self._input_layer_template.variable_scope
 
   def __call__(self, features):
     return self._input_layer_template(
-      features=features,
-      feature_columns=self._feature_columns,
-      weight_collections=self._weight_collections,
-      trainable=self._trainable,
-      cols_to_vars=None,
-      from_template=True)
+        features=features,
+        feature_columns=self._feature_columns,
+        weight_collections=self._weight_collections,
+        trainable=self._trainable,
+        cols_to_vars=None,
+        from_template=True)
 
   @property
   def name(self):
@@ -500,12 +499,12 @@ def linear_model(features,
   with variable_scope.variable_scope(None, 'linear_model') as vs:
     model_name = _strip_leading_slashes(vs.name)
   linear_model_layer = _LinearModel(
-    feature_columns=feature_columns,
-    units=units,
-    sparse_combiner=sparse_combiner,
-    weight_collections=weight_collections,
-    trainable=trainable,
-    name=model_name)
+      feature_columns=feature_columns,
+      units=units,
+      sparse_combiner=sparse_combiner,
+      weight_collections=weight_collections,
+      trainable=trainable,
+      name=model_name)
   retval = linear_model_layer(features)  # pylint: disable=not-callable
   if cols_to_vars is not None:
     cols_to_vars.update(linear_model_layer.cols_to_vars())
@@ -549,7 +548,7 @@ def __init__(self,
                name=None,
                **kwargs):
     super(_FCLinearWrapper, self).__init__(
-      trainable=trainable, name=name, **kwargs)
+        trainable=trainable, name=name, **kwargs)
     self._feature_column = feature_column
     self._units = units
     self._sparse_combiner = sparse_combiner
@@ -558,30 +557,30 @@ def __init__(self,
   def build(self, _):
     if isinstance(self._feature_column, _CategoricalColumn):
       weight = self.add_variable(
-        name='weights',
-        shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
-        initializer=init_ops.zeros_initializer(),
-        trainable=self.trainable)
+          name='weights',
+          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
     else:
       num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
       weight = self.add_variable(
-        name='weights',
-        shape=[num_elements, self._units],
-        initializer=init_ops.zeros_initializer(),
-        trainable=self.trainable)
+          name='weights',
+          shape=[num_elements, self._units],
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
     _add_to_collections(weight, self._weight_collections)
     self._weight_var = weight
     self.built = True
 
   def call(self, builder):
     weighted_sum = _create_weighted_sum(
-      column=self._feature_column,
-      builder=builder,
-      units=self._units,
-      sparse_combiner=self._sparse_combiner,
-      weight_collections=self._weight_collections,
-      trainable=self.trainable,
-      weight_var=self._weight_var)
+        column=self._feature_column,
+        builder=builder,
+        units=self._units,
+        sparse_combiner=self._sparse_combiner,
+        weight_collections=self._weight_collections,
+        trainable=self.trainable,
+        weight_var=self._weight_var)
     return weighted_sum
 
 
@@ -600,10 +599,10 @@ def __init__(self,
 
   def build(self, _):
     self._bias_variable = self.add_variable(
-      'bias_weights',
-      shape=[self._units],
-      initializer=init_ops.zeros_initializer(),
-      trainable=self.trainable)
+        'bias_weights',
+        shape=[self._units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=self.trainable)
     _add_to_collections(self._bias_variable, self._weight_collections)
     self.built = True
 
@@ -659,11 +658,11 @@ def __init__(self,
       column_layers[column_name] = column_layer
     self._column_layers = self._add_layers(column_layers)
     self._bias_layer = _BiasLayer(
-      units=units,
-      trainable=trainable,
-      weight_collections=self._weight_collections,
-      name='bias_layer',
-      **kwargs)
+        units=units,
+        trainable=trainable,
+        weight_collections=self._weight_collections,
+        name='bias_layer',
+        **kwargs)
     self._cols_to_vars = {}
 
   def cols_to_vars(self):
@@ -679,8 +678,8 @@ def call(self, features):
       for column in self._feature_columns:
         if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
           raise ValueError(
-            'Items of feature_columns must be either a '
-            '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
+              'Items of feature_columns must be either a '
+              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
       weighted_sums = []
       ordered_columns = []
       builder = _LazyBuilder(features)
@@ -690,17 +689,17 @@ def call(self, features):
         weighted_sum = layer(builder)
         weighted_sums.append(weighted_sum)
         self._cols_to_vars[column] = ops.get_collection(
-          ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
+            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
 
       _verify_static_batch_size_equality(weighted_sums, ordered_columns)
       predictions_no_bias = math_ops.add_n(
-        weighted_sums, name='weighted_sum_no_bias')
+          weighted_sums, name='weighted_sum_no_bias')
       predictions = nn_ops.bias_add(
-        predictions_no_bias,
-        self._bias_layer(  # pylint: disable=not-callable
-          builder,
-          scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
-        name='weighted_sum')
+          predictions_no_bias,
+          self._bias_layer(  # pylint: disable=not-callable
+              builder,
+              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
+          name='weighted_sum')
       bias = self._bias_layer.variables[0]
       self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
     return predictions
@@ -905,31 +904,31 @@ def model_fn(features, ...):
   if (initializer is not None) and (not callable(initializer)):
     raise ValueError('initializer must be callable if specified. '
                      'Embedding of column_name: {}'.format(
-      categorical_column.name))
+                         categorical_column.name))
   if initializer is None:
     initializer = init_ops.truncated_normal_initializer(
-      mean=0.0, stddev=0.01 / math.sqrt(dimension))
+        mean=0.0, stddev=0.01 / math.sqrt(dimension))
 
   embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
 
   def _creator(weight_collections, scope):
     embedding_column_layer = _EmbeddingColumnLayer(
-      embedding_shape=embedding_shape,
-      initializer=initializer,
-      weight_collections=weight_collections,
-      trainable=trainable,
-      name='embedding_column_layer')
+        embedding_shape=embedding_shape,
+        initializer=initializer,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        name='embedding_column_layer')
     return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
 
   return _EmbeddingColumn(
-    categorical_column=categorical_column,
-    dimension=dimension,
-    combiner=combiner,
-    layer_creator=_creator,
-    ckpt_to_load_from=ckpt_to_load_from,
-    tensor_name_in_ckpt=tensor_name_in_ckpt,
-    max_norm=max_norm,
-    trainable=trainable)
+      categorical_column=categorical_column,
+      dimension=dimension,
+      combiner=combiner,
+      layer_creator=_creator,
+      ckpt_to_load_from=ckpt_to_load_from,
+      tensor_name_in_ckpt=tensor_name_in_ckpt,
+      max_norm=max_norm,
+      trainable=trainable)
 
 
 def _numeric_column(key,
@@ -996,15 +995,15 @@ def _numeric_column(key,
 
   if normalizer_fn is not None and not callable(normalizer_fn):
     raise TypeError(
-      'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
+        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
 
   fc_utils.assert_key_is_string(key)
   return _NumericColumn(
-    key,
-    shape=shape,
-    default_value=default_value,
-    dtype=dtype,
-    normalizer_fn=normalizer_fn)
+      key,
+      shape=shape,
+      default_value=default_value,
+      dtype=dtype,
+      normalizer_fn=normalizer_fn)
 
 
 def _bucketized_column(source_column, boundaries):
@@ -1075,8 +1074,8 @@ def _bucketized_column(source_column, boundaries):
   """
   if not isinstance(source_column, _NumericColumn):
     raise ValueError(
-      'source_column must be a column generated with numeric_column(). '
-      'Given: {}'.format(source_column))
+        'source_column must be a column generated with numeric_column(). '
+        'Given: {}'.format(source_column))
   if len(source_column.shape) > 1:
     raise ValueError('source_column must be one-dimensional column. '
                      'Given: {}'.format(source_column))
@@ -1139,7 +1138,7 @@ def _categorical_column_with_hash_bucket(key,
   if hash_bucket_size < 1:
     raise ValueError('hash_bucket_size must be at least 1. '
                      'hash_bucket_size: {}, key: {}'.format(
-      hash_bucket_size, key))
+                         hash_bucket_size, key))
 
   fc_utils.assert_key_is_string(key)
   fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
@@ -1241,8 +1240,8 @@ def _categorical_column_with_vocabulary_file(key,
     with gfile.GFile(vocabulary_file) as f:
       vocabulary_size = sum(1 for _ in f)
     logging.info(
-      'vocabulary_size = %d in %s is inferred from the number of elements '
-      'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
+        'vocabulary_size = %d in %s is inferred from the number of elements '
+        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
 
   # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
   if vocabulary_size < 1:
@@ -1250,20 +1249,20 @@ def _categorical_column_with_vocabulary_file(key,
   if num_oov_buckets:
     if default_value is not None:
       raise ValueError(
-        'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
-          key))
+          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
+              key))
     if num_oov_buckets < 0:
       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
-        num_oov_buckets, key))
+          num_oov_buckets, key))
   fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
   fc_utils.assert_key_is_string(key)
   return _VocabularyFileCategoricalColumn(
-    key=key,
-    vocabulary_file=vocabulary_file,
-    vocabulary_size=vocabulary_size,
-    num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
-    default_value=-1 if default_value is None else default_value,
-    dtype=dtype)
+      key=key,
+      vocabulary_file=vocabulary_file,
+      vocabulary_size=vocabulary_size,
+      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
+      default_value=-1 if default_value is None else default_value,
+      dtype=dtype)
 
 
 def _categorical_column_with_vocabulary_list(key,
@@ -1348,38 +1347,38 @@ def _categorical_column_with_vocabulary_list(key,
   """
   if (vocabulary_list is None) or (len(vocabulary_list) < 1):
     raise ValueError(
-      'vocabulary_list {} must be non-empty, column_name: {}'.format(
-        vocabulary_list, key))
+        'vocabulary_list {} must be non-empty, column_name: {}'.format(
+            vocabulary_list, key))
   if len(set(vocabulary_list)) != len(vocabulary_list):
     raise ValueError(
-      'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
-        vocabulary_list, key))
+        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
+            vocabulary_list, key))
   vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
   if num_oov_buckets:
     if default_value != -1:
       raise ValueError(
-        'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
-          key))
+          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
+              key))
     if num_oov_buckets < 0:
       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
-        num_oov_buckets, key))
+          num_oov_buckets, key))
   fc_utils.assert_string_or_int(
-    vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
+      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
   if dtype is None:
     dtype = vocabulary_dtype
   elif dtype.is_integer != vocabulary_dtype.is_integer:
     raise ValueError(
-      'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
-        dtype, vocabulary_dtype, key))
+        'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
+            dtype, vocabulary_dtype, key))
   fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
   fc_utils.assert_key_is_string(key)
 
   return _VocabularyListCategoricalColumn(
-    key=key,
-    vocabulary_list=tuple(vocabulary_list),
-    dtype=dtype,
-    default_value=default_value,
-    num_oov_buckets=num_oov_buckets)
+      key=key,
+      vocabulary_list=tuple(vocabulary_list),
+      dtype=dtype,
+      default_value=default_value,
+      num_oov_buckets=num_oov_buckets)
 
 
 def _categorical_column_with_identity(key, num_buckets, default_value=None):
@@ -1438,15 +1437,15 @@ def _categorical_column_with_identity(key, num_buckets, default_value=None):
   """
   if num_buckets < 1:
     raise ValueError('num_buckets {} < 1, column_name {}'.format(
-      num_buckets, key))
+        num_buckets, key))
   if (default_value is not None) and ((default_value < 0) or
                                       (default_value >= num_buckets)):
     raise ValueError(
-      'default_value {} not in range [0, {}), column_name {}'.format(
-        default_value, num_buckets, key))
+        'default_value {} not in range [0, {}), column_name {}'.format(
+            default_value, num_buckets, key))
   fc_utils.assert_key_is_string(key)
   return _IdentityCategoricalColumn(
-    key=key, num_buckets=num_buckets, default_value=default_value)
+      key=key, num_buckets=num_buckets, default_value=default_value)
 
 
 def _indicator_column(categorical_column):
@@ -1553,9 +1552,9 @@ def _weighted_categorical_column(categorical_column,
   if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype {} is not convertible to float.'.format(dtype))
   return _WeightedCategoricalColumn(
-    categorical_column=categorical_column,
-    weight_feature_key=weight_feature_key,
-    dtype=dtype)
+      categorical_column=categorical_column,
+      weight_feature_key=weight_feature_key,
+      dtype=dtype)
 
 
 def _crossed_column(keys, hash_bucket_size, hash_key=None):
@@ -1667,21 +1666,21 @@ def _crossed_column(keys, hash_bucket_size, hash_key=None):
                      'hash_bucket_size: {}'.format(hash_bucket_size))
   if not keys or len(keys) < 2:
     raise ValueError(
-      'keys must be a list with length > 1. Given: {}'.format(keys))
+        'keys must be a list with length > 1. Given: {}'.format(keys))
   for key in keys:
     if (not isinstance(key, six.string_types) and
         not isinstance(key, _CategoricalColumn)):
       raise ValueError(
-        'Unsupported key type. All keys must be either string, or '
-        'categorical column except _HashedCategoricalColumn. '
-        'Given: {}'.format(key))
+          'Unsupported key type. All keys must be either string, or '
+          'categorical column except _HashedCategoricalColumn. '
+          'Given: {}'.format(key))
     if isinstance(key, _HashedCategoricalColumn):
       raise ValueError(
-        'categorical_column_with_hash_bucket is not supported for crossing. '
-        'Hashing before crossing will increase probability of collision. '
-        'Instead, use the feature name as a string. Given: {}'.format(key))
+          'categorical_column_with_hash_bucket is not supported for crossing. '
+          'Hashing before crossing will increase probability of collision. '
+          'Instead, use the feature name as a string. Given: {}'.format(key))
   return _CrossedColumn(
-    keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key)
+      keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key)
 
 
 # TODO(rohanj): Clearly define semantics of this layer.
@@ -1710,7 +1709,7 @@ def __init__(self,
       **kwargs: keyword named properties.
     """
     super(_EmbeddingColumnLayer, self).__init__(
-      trainable=trainable, name=name, **kwargs)
+        trainable=trainable, name=name, **kwargs)
     self._embedding_shape = embedding_shape
     self._initializer = initializer
     self._weight_collections = weight_collections
@@ -1726,11 +1725,11 @@ def set_weight_collections(self, weight_collections):
 
   def build(self, _):
     self._embedding_weight_var = self.add_variable(
-      name='embedding_weights',
-      shape=self._embedding_shape,
-      dtype=dtypes.float32,
-      initializer=self._initializer,
-      trainable=self.trainable)
+        name='embedding_weights',
+        shape=self._embedding_shape,
+        dtype=dtypes.float32,
+        initializer=self._initializer,
+        trainable=self.trainable)
     if self._weight_collections and not context.executing_eagerly():
       _add_to_collections(self._embedding_weight_var, self._weight_collections)
     self.built = True
@@ -1876,21 +1875,21 @@ def _create_weighted_sum(column,
   """Creates a weighted sum for a dense/categorical column for linear_model."""
   if isinstance(column, _CategoricalColumn):
     return _create_categorical_column_weighted_sum(
-      column=column,
-      builder=builder,
-      units=units,
-      sparse_combiner=sparse_combiner,
-      weight_collections=weight_collections,
-      trainable=trainable,
-      weight_var=weight_var)
+        column=column,
+        builder=builder,
+        units=units,
+        sparse_combiner=sparse_combiner,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        weight_var=weight_var)
   else:
     return _create_dense_column_weighted_sum(
-      column=column,
-      builder=builder,
-      units=units,
-      weight_collections=weight_collections,
-      trainable=trainable,
-      weight_var=weight_var)
+        column=column,
+        builder=builder,
+        units=units,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        weight_var=weight_var)
 
 
 def _create_dense_column_weighted_sum(column,
@@ -1901,9 +1900,9 @@ def _create_dense_column_weighted_sum(column,
                                       weight_var=None):
   """Create a weighted sum of a dense column for linear_model."""
   tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-    builder,
-    weight_collections=weight_collections,
-    trainable=trainable)
+      builder,
+      weight_collections=weight_collections,
+      trainable=trainable)
   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
@@ -1911,11 +1910,11 @@ def _create_dense_column_weighted_sum(column,
     weight = weight_var
   else:
     weight = variable_scope.get_variable(
-      name='weights',
-      shape=[num_elements, units],
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+        name='weights',
+        shape=[num_elements, units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return math_ops.matmul(tensor, weight, name='weighted_sum')
 
 
@@ -1929,7 +1928,7 @@ class _CategoricalColumn(_FeatureColumn):
   """
 
   IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
-    'IdWeightPair', ['id_tensor', 'weight_tensor'])
+      'IdWeightPair', ['id_tensor', 'weight_tensor'])
 
   @abc.abstractproperty
   def _num_buckets(self):
@@ -1999,39 +1998,39 @@ def _create_categorical_column_weighted_sum(column,
   sparse_combiner = "sum".
   """
   sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
-    builder,
-    weight_collections=weight_collections,
-    trainable=trainable)
+      builder,
+      weight_collections=weight_collections,
+      trainable=trainable)
   id_tensor = sparse_ops.sparse_reshape(
-    sparse_tensors.id_tensor,
-    [array_ops.shape(sparse_tensors.id_tensor)[0], -1])
+      sparse_tensors.id_tensor,
+      [array_ops.shape(sparse_tensors.id_tensor)[0], -1])
   weight_tensor = sparse_tensors.weight_tensor
   if weight_tensor is not None:
     weight_tensor = sparse_ops.sparse_reshape(
-      weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
+        weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
   if weight_var is not None:
     weight = weight_var
   else:
     weight = variable_scope.get_variable(
-      name='weights',
-      shape=(column._num_buckets, units),  # pylint: disable=protected-access
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+        name='weights',
+        shape=(column._num_buckets, units),  # pylint: disable=protected-access
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return embedding_ops.safe_embedding_lookup_sparse(
-    weight,
-    id_tensor,
-    sparse_weights=weight_tensor,
-    combiner=sparse_combiner,
-    name='weighted_sum')
+      weight,
+      id_tensor,
+      sparse_weights=weight_tensor,
+      combiner=sparse_combiner,
+      name='weighted_sum')
 
 
 class _SequenceDenseColumn(_FeatureColumn):
   """Represents dense sequence data."""
 
   TensorSequenceLengthPair = collections.namedtuple(  # pylint: disable=invalid-name
-    'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
+      'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
 
   @abc.abstractmethod
   def _get_sequence_dense_tensor(self,
@@ -2147,7 +2146,7 @@ def _get_raw_feature_as_tensor(self, key):
     """
     raw_feature = self._features[key]
     feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
-      raw_feature)
+        raw_feature)
 
     def expand_dims(input_tensor):
       # Input_tensor must have rank 1.
@@ -2161,20 +2160,20 @@ def expand_dims(input_tensor):
     if rank is not None:
       if rank == 0:
         raise ValueError(
-          'Feature (key: {}) cannot have rank 0. Give: {}'.format(
-            key, feature_tensor))
+            'Feature (key: {}) cannot have rank 0. Give: {}'.format(
+                key, feature_tensor))
       return feature_tensor if rank != 1 else expand_dims(feature_tensor)
 
     # Handle dynamic rank.
     with ops.control_dependencies([
-      check_ops.assert_positive(
-        array_ops.rank(feature_tensor),
-        message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
-          key, feature_tensor))
+        check_ops.assert_positive(
+            array_ops.rank(feature_tensor),
+            message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
+                key, feature_tensor))
     ]):
       return control_flow_ops.cond(
-        math_ops.equal(1, array_ops.rank(feature_tensor)),
-        lambda: expand_dims(feature_tensor), lambda: feature_tensor)
+          math_ops.equal(1, array_ops.rank(feature_tensor)),
+          lambda: expand_dims(feature_tensor), lambda: feature_tensor)
 
 
 # TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
@@ -2209,7 +2208,7 @@ def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
     ValueError: when `input_tensor`'s rank is `None`.
   """
   input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
-    input_tensor)
+      input_tensor)
   if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
     return input_tensor
   with ops.name_scope(None, 'to_sparse_input', (
@@ -2228,14 +2227,14 @@ def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
         # default value for that type.
         ignore_value = input_tensor.dtype.as_numpy_dtype()
     ignore_value = math_ops.cast(
-      ignore_value, input_tensor.dtype, name='ignore_value')
+        ignore_value, input_tensor.dtype, name='ignore_value')
     indices = array_ops.where(
-      math_ops.not_equal(input_tensor, ignore_value), name='indices')
+        math_ops.not_equal(input_tensor, ignore_value), name='indices')
     return sparse_tensor_lib.SparseTensor(
-      indices=indices,
-      values=array_ops.gather_nd(input_tensor, indices, name='values'),
-      dense_shape=array_ops.shape(
-        input_tensor, out_type=dtypes.int64, name='dense_shape'))
+        indices=indices,
+        values=array_ops.gather_nd(input_tensor, indices, name='values'),
+        dense_shape=array_ops.shape(
+            input_tensor, out_type=dtypes.int64, name='dense_shape'))
 
 
 def _normalize_feature_columns(feature_columns):
@@ -2284,10 +2283,10 @@ def _normalize_feature_columns(feature_columns):
 
 
 class _NumericColumn(
-  _DenseColumn,
-  collections.namedtuple(
-    '_NumericColumn',
-    ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])):
+    _DenseColumn,
+    collections.namedtuple(
+        '_NumericColumn',
+        ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])):
   """see `numeric_column`."""
 
   @property
@@ -2297,17 +2296,17 @@ def name(self):
   @property
   def _parse_example_spec(self):
     return {
-      self.key:
-        parsing_ops.FixedLenFeature(self.shape, self.dtype,
-                                    self.default_value)
+        self.key:
+            parsing_ops.FixedLenFeature(self.shape, self.dtype,
+                                        self.default_value)
     }
 
   def _transform_feature(self, inputs):
     input_tensor = inputs.get(self.key)
     if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
       raise ValueError(
-        'The corresponding Tensor of numerical column must be a Tensor. '
-        'SparseTensor is not supported. key: {}'.format(self.key))
+          'The corresponding Tensor of numerical column must be a Tensor. '
+          'SparseTensor is not supported. key: {}'.format(self.key))
     if self.normalizer_fn is not None:
       input_tensor = self.normalizer_fn(input_tensor)
     return math_ops.cast(input_tensor, dtypes.float32)
@@ -2359,23 +2358,23 @@ def _parse_example_spec(self):
   def _transform_feature(self, inputs):
     source_tensor = inputs.get(self.source_column)
     return math_ops._bucketize(  # pylint: disable=protected-access
-      source_tensor,
-      boundaries=self.boundaries)
+        source_tensor,
+        boundaries=self.boundaries)
 
   @property
   def _variable_shape(self):
     return tensor_shape.TensorShape(
-      tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
+        tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     del weight_collections
     del trainable
     input_tensor = inputs.get(self)
     return array_ops.one_hot(
-      indices=math_ops.cast(input_tensor, dtypes.int64),
-      depth=len(self.boundaries) + 1,
-      on_value=1.,
-      off_value=0.)
+        indices=math_ops.cast(input_tensor, dtypes.int64),
+        depth=len(self.boundaries) + 1,
+        on_value=1.,
+        off_value=0.)
 
   @property
   def _num_buckets(self):
@@ -2393,9 +2392,9 @@ def _get_sparse_tensors(self,
     source_dimension = self.source_column.shape[0]
 
     i1 = array_ops.reshape(
-      array_ops.tile(
-        array_ops.expand_dims(math_ops.range(0, batch_size), 1),
-        [1, source_dimension]), (-1,))
+        array_ops.tile(
+            array_ops.expand_dims(math_ops.range(0, batch_size), 1),
+            [1, source_dimension]), (-1,))
     i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
     # Flatten the bucket indices and unique them across dimensions
     # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
@@ -2404,20 +2403,20 @@ def _get_sparse_tensors(self,
                           (-1,)) + (len(self.boundaries) + 1) * i2)
 
     indices = math_ops.cast(
-      array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
+        array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
     dense_shape = math_ops.cast(
-      array_ops.stack([batch_size, source_dimension]), dtypes.int64)
+        array_ops.stack([batch_size, source_dimension]), dtypes.int64)
     sparse_tensor = sparse_tensor_lib.SparseTensor(
-      indices=indices, values=bucket_indices, dense_shape=dense_shape)
+        indices=indices, values=bucket_indices, dense_shape=dense_shape)
     return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
 
 
 class _EmbeddingColumn(
-  _DenseColumn, _SequenceDenseColumn,
-  collections.namedtuple(
-    '_EmbeddingColumn',
-    ('categorical_column', 'dimension', 'combiner', 'layer_creator',
-     'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
+    _DenseColumn, _SequenceDenseColumn,
+    collections.namedtuple(
+        '_EmbeddingColumn',
+        ('categorical_column', 'dimension', 'combiner', 'layer_creator',
+         'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
   """See `embedding_column`."""
 
   @property
@@ -2446,47 +2445,47 @@ def _get_dense_tensor_internal(self,
     """Private method that follows the signature of _get_dense_tensor."""
     # Get sparse IDs and weights.
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
-      inputs,
-      weight_collections=weight_collections,
-      trainable=trainable)
+        inputs,
+        weight_collections=weight_collections,
+        trainable=trainable)
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
     embedding_weights = self.layer_creator(
-      weight_collections=weight_collections,
-      scope=variable_scope.get_variable_scope())
+        weight_collections=weight_collections,
+        scope=variable_scope.get_variable_scope())
 
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
       if isinstance(to_restore, variables.PartitionedVariable):
         to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
       checkpoint_utils.init_from_checkpoint(
-        self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
+          self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
 
     # Return embedding lookup result.
     return embedding_ops.safe_embedding_lookup_sparse(
-      embedding_weights=embedding_weights,
-      sparse_ids=sparse_ids,
-      sparse_weights=sparse_weights,
-      combiner=self.combiner,
-      name='%s_weights' % self.name,
-      max_norm=self.max_norm)
+        embedding_weights=embedding_weights,
+        sparse_ids=sparse_ids,
+        sparse_weights=sparse_weights,
+        combiner=self.combiner,
+        name='%s_weights' % self.name,
+        max_norm=self.max_norm)
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-        'In embedding_column: {}. '
-        'categorical_column must not be of type _SequenceCategoricalColumn. '
-        'Suggested fix A: If you wish to use input_layer, use a '
-        'non-sequence categorical_column_with_*. '
-        'Suggested fix B: If you wish to create sequence input, use '
-        'sequence_input_layer instead of input_layer. '
-        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                     self.categorical_column))
+          'In embedding_column: {}. '
+          'categorical_column must not be of type _SequenceCategoricalColumn. '
+          'Suggested fix A: If you wish to use input_layer, use a '
+          'non-sequence categorical_column_with_*. '
+          'Suggested fix B: If you wish to create sequence input, use '
+          'sequence_input_layer instead of input_layer. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
     return self._get_dense_tensor_internal(
-      inputs=inputs,
-      weight_collections=weight_collections,
-      trainable=trainable)
+        inputs=inputs,
+        weight_collections=weight_collections,
+        trainable=trainable)
 
   def _get_sequence_dense_tensor(self,
                                  inputs,
@@ -2494,22 +2493,22 @@ def _get_sequence_dense_tensor(self,
                                  trainable=None):
     if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-        'In embedding_column: {}. '
-        'categorical_column must be of type _SequenceCategoricalColumn '
-        'to use sequence_input_layer. '
-        'Suggested fix: Use one of sequence_categorical_column_with_*. '
-        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                     self.categorical_column))
+          'In embedding_column: {}. '
+          'categorical_column must be of type _SequenceCategoricalColumn '
+          'to use sequence_input_layer. '
+          'Suggested fix: Use one of sequence_categorical_column_with_*. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
     dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
-      inputs=inputs,
-      weight_collections=weight_collections,
-      trainable=trainable)
+        inputs=inputs,
+        weight_collections=weight_collections,
+        trainable=trainable)
 
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     sequence_length = fc_utils.sequence_length_from_sparse_tensor(
-      sparse_tensors.id_tensor)
+        sparse_tensors.id_tensor)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
-      dense_tensor=dense_tensor, sequence_length=sequence_length)
+        dense_tensor=dense_tensor, sequence_length=sequence_length)
 
 
 def _get_graph_for_variable(var):
@@ -2520,13 +2519,13 @@ def _get_graph_for_variable(var):
 
 
 class _SharedEmbeddingColumn(
-  _DenseColumn, _SequenceDenseColumn,
-  collections.namedtuple(
-    '_SharedEmbeddingColumn',
-    ('categorical_column', 'dimension', 'combiner', 'initializer',
-     'shared_embedding_collection_name', 'ckpt_to_load_from',
-     'tensor_name_in_ckpt', 'max_norm', 'trainable', 'partitioner',
-     'ev_params'))):
+    _DenseColumn, _SequenceDenseColumn,
+    collections.namedtuple(
+        '_SharedEmbeddingColumn',
+        ('categorical_column', 'dimension', 'combiner', 'initializer',
+         'shared_embedding_collection_name', 'ckpt_to_load_from',
+         'tensor_name_in_ckpt', 'max_norm', 'trainable', 'partitioner',
+         'ev_params'))):
   """See `embedding_column`."""
 
   @property
@@ -2606,45 +2605,45 @@ def _get_dense_tensor_internal(self,
     with ops.name_scope(None, default_name=self.name):
       # Get sparse IDs and weights.
       sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
-        inputs,
-        weight_collections=weight_collections,
-        trainable=trainable)
+          inputs,
+          weight_collections=weight_collections,
+          trainable=trainable)
       sparse_ids = sparse_tensors.id_tensor
       sparse_weights = sparse_tensors.weight_tensor
 
       embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
       shared_embedding_collection = ops.get_collection(
-        self.shared_embedding_collection_name)
+          self.shared_embedding_collection_name)
       if shared_embedding_collection:
         if len(shared_embedding_collection) > 1:
           raise ValueError(
-            'Collection {} can only contain one variable. '
-            'Suggested fix A: Choose a unique name for this collection. '
-            'Suggested fix B: Do not add any variables to this collection. '
-            'The feature_column library already adds a variable under the '
-            'hood.'.format(shared_embedding_collection))
+              'Collection {} can only contain one variable. '
+              'Suggested fix A: Choose a unique name for this collection. '
+              'Suggested fix B: Do not add any variables to this collection. '
+              'The feature_column library already adds a variable under the '
+              'hood.'.format(shared_embedding_collection))
         embedding_weights = shared_embedding_collection[0]
         if embedding_weights.get_shape(
         ) != embedding_shape and not self.ev_params is not None:  # noqa : E714
           raise ValueError(
-            'Shared embedding collection {} contains variable {} of '
-            'unexpected shape {}. Expected shape is {}. '
-            'Suggested fix A: Choose a unique name for this collection. '
-            'Suggested fix B: Do not add any variables to this collection. '
-            'The feature_column library already adds a variable under the '
-            'hood.'.format(self.shared_embedding_collection_name,
-                           embedding_weights.name,
-                           embedding_weights.get_shape(), embedding_shape))
+              'Shared embedding collection {} contains variable {} of '
+              'unexpected shape {}. Expected shape is {}. '
+              'Suggested fix A: Choose a unique name for this collection. '
+              'Suggested fix B: Do not add any variables to this collection. '
+              'The feature_column library already adds a variable under the '
+              'hood.'.format(self.shared_embedding_collection_name,
+                             embedding_weights.name,
+                             embedding_weights.get_shape(), embedding_shape))
       else:
         if self.ev_params is None:
           embedding_weights = variable_scope.get_variable(
-            name='embedding_weights',
-            shape=embedding_shape,
-            dtype=dtypes.float32,
-            initializer=self.initializer,
-            trainable=self.trainable and trainable,
-            partitioner=self.partitioner,
-            collections=weight_collections)
+              name='embedding_weights',
+              shape=embedding_shape,
+              dtype=dtypes.float32,
+              initializer=self.initializer,
+              trainable=self.trainable and trainable,
+              partitioner=self.partitioner,
+              collections=weight_collections)
         else:
           # at eval or inference time, it is necessary to set
           # the initializers to zeros, so that new key will
@@ -2656,16 +2655,16 @@ def _get_dense_tensor_internal(self,
           else:
             initializer = self.initializer
           embedding_weights = variable_scope.get_embedding_variable(
-            name='embedding_weights',
-            embedding_dim=self.dimension,
-            initializer=initializer,
-            trainable=self.trainable and trainable,
-            partitioner=self.partitioner,
-            collections=weight_collections,
-            steps_to_live=self.ev_params.steps_to_live
-            if self.ev_params is not None else None,
-            filter_options=variables.CounterFilterOptions(
-              self.ev_params.filter_freq))
+              name='embedding_weights',
+              embedding_dim=self.dimension,
+              initializer=initializer,
+              trainable=self.trainable and trainable,
+              partitioner=self.partitioner,
+              collections=weight_collections,
+              steps_to_live=self.ev_params.steps_to_live
+              if self.ev_params is not None else None,
+              filter_options=variables.CounterFilterOptions(
+                  self.ev_params.filter_freq))
 
         ops.add_to_collection(self.shared_embedding_collection_name,
                               embedding_weights)
@@ -2674,41 +2673,41 @@ def _get_dense_tensor_internal(self,
         if isinstance(to_restore, variables.PartitionedVariable):
           to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
         checkpoint_utils.init_from_checkpoint(
-          self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
+            self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
 
       # Return embedding lookup result.
       if self.ev_params is not None:
         return ev_embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights=embedding_weights,
-          sparse_ids=sparse_ids,
-          sparse_weights=sparse_weights,
-          combiner=self.combiner,
-          name='%s_weights' % self.name,
-          max_norm=self.max_norm)
+            embedding_weights=embedding_weights,
+            sparse_ids=sparse_ids,
+            sparse_weights=sparse_weights,
+            combiner=self.combiner,
+            name='%s_weights' % self.name,
+            max_norm=self.max_norm)
       else:
         return embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights=embedding_weights,
-          sparse_ids=sparse_ids,
-          sparse_weights=sparse_weights,
-          combiner=self.combiner,
-          name='%s_weights' % self.name,
-          max_norm=self.max_norm)
+            embedding_weights=embedding_weights,
+            sparse_ids=sparse_ids,
+            sparse_weights=sparse_weights,
+            combiner=self.combiner,
+            name='%s_weights' % self.name,
+            max_norm=self.max_norm)
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-        'In embedding_column: {}. '
-        'categorical_column must not be of type _SequenceCategoricalColumn. '
-        'Suggested fix A: If you wish to use input_layer, use a '
-        'non-sequence categorical_column_with_*. '
-        'Suggested fix B: If you wish to create sequence input, use '
-        'sequence_input_layer instead of input_layer. '
-        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                     self.categorical_column))
+          'In embedding_column: {}. '
+          'categorical_column must not be of type _SequenceCategoricalColumn. '
+          'Suggested fix A: If you wish to use input_layer, use a '
+          'non-sequence categorical_column_with_*. '
+          'Suggested fix B: If you wish to create sequence input, use '
+          'sequence_input_layer instead of input_layer. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
     return self._get_dense_tensor_internal(
-      inputs=inputs,
-      weight_collections=weight_collections,
-      trainable=trainable)
+        inputs=inputs,
+        weight_collections=weight_collections,
+        trainable=trainable)
 
   def _get_sequence_dense_tensor(self,
                                  inputs,
@@ -2716,21 +2715,21 @@ def _get_sequence_dense_tensor(self,
                                  trainable=None):
     if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-        'In embedding_column: {}. '
-        'categorical_column must be of type _SequenceCategoricalColumn '
-        'to use sequence_input_layer. '
-        'Suggested fix: Use one of sequence_categorical_column_with_*. '
-        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                     self.categorical_column))
+          'In embedding_column: {}. '
+          'categorical_column must be of type _SequenceCategoricalColumn '
+          'to use sequence_input_layer. '
+          'Suggested fix: Use one of sequence_categorical_column_with_*. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
     dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
-      inputs=inputs,
-      weight_collections=weight_collections,
-      trainable=trainable)
+        inputs=inputs,
+        weight_collections=weight_collections,
+        trainable=trainable)
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     sequence_length = fc_utils.sequence_length_from_sparse_tensor(
-      sparse_tensors.id_tensor)
+        sparse_tensors.id_tensor)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
-      dense_tensor=dense_tensor, sequence_length=sequence_length)
+        dense_tensor=dense_tensor, sequence_length=sequence_length)
 
 
 def _check_shape(shape, key):
@@ -2751,8 +2750,8 @@ def _check_shape(shape, key):
 
 class _HashedCategoricalColumn(_CategoricalColumn,
                                collections.namedtuple(
-                                 '_HashedCategoricalColumn',
-                                 ['key', 'hash_bucket_size', 'dtype'])):
+                                   '_HashedCategoricalColumn',
+                                   ['key', 'hash_bucket_size', 'dtype'])):
   """see `categorical_column_with_hash_bucket`."""
 
   @property
@@ -2773,14 +2772,14 @@ def _transform_feature(self, inputs):
       raise ValueError('SparseColumn input must be a SparseTensor.')
 
     fc_utils.assert_string_or_int(
-      input_tensor.dtype,
-      prefix='column_name: {} input_tensor'.format(self.key))
+        input_tensor.dtype,
+        prefix='column_name: {} input_tensor'.format(self.key))
 
     if self.dtype.is_integer != input_tensor.dtype.is_integer:
       raise ValueError(
-        'Column dtype and SparseTensors dtype must be compatible. '
-        'key: {}, column dtype: {}, tensor dtype: {}'.format(
-          self.key, self.dtype, input_tensor.dtype))
+          'Column dtype and SparseTensors dtype must be compatible. '
+          'key: {}, column dtype: {}, tensor dtype: {}'.format(
+              self.key, self.dtype, input_tensor.dtype))
 
     if self.dtype == dtypes.string:
       sparse_values = input_tensor.values
@@ -2788,7 +2787,7 @@ def _transform_feature(self, inputs):
       sparse_values = string_ops.as_string(input_tensor.values)
 
     sparse_id_values = string_ops.string_to_hash_bucket_fast(
-      sparse_values, self.hash_bucket_size, name='lookup')
+        sparse_values, self.hash_bucket_size, name='lookup')
     return sparse_tensor_lib.SparseTensor(input_tensor.indices,
                                           sparse_id_values,
                                           input_tensor.dense_shape)
@@ -2806,10 +2805,10 @@ def _get_sparse_tensors(self,
 
 
 class _VocabularyFileCategoricalColumn(
-  _CategoricalColumn,
-  collections.namedtuple('_VocabularyFileCategoricalColumn',
-                         ('key', 'vocabulary_file', 'vocabulary_size',
-                          'num_oov_buckets', 'dtype', 'default_value'))):
+    _CategoricalColumn,
+    collections.namedtuple('_VocabularyFileCategoricalColumn',
+                           ('key', 'vocabulary_file', 'vocabulary_size',
+                            'num_oov_buckets', 'dtype', 'default_value'))):
   """See `categorical_column_with_vocabulary_file`."""
 
   @property
@@ -2825,13 +2824,13 @@ def _transform_feature(self, inputs):
 
     if self.dtype.is_integer != input_tensor.dtype.is_integer:
       raise ValueError(
-        'Column dtype and SparseTensors dtype must be compatible. '
-        'key: {}, column dtype: {}, tensor dtype: {}'.format(
-          self.key, self.dtype, input_tensor.dtype))
+          'Column dtype and SparseTensors dtype must be compatible. '
+          'key: {}, column dtype: {}, tensor dtype: {}'.format(
+              self.key, self.dtype, input_tensor.dtype))
 
     fc_utils.assert_string_or_int(
-      input_tensor.dtype,
-      prefix='column_name: {} input_tensor'.format(self.key))
+        input_tensor.dtype,
+        prefix='column_name: {} input_tensor'.format(self.key))
 
     key_dtype = self.dtype
     if input_tensor.dtype.is_integer:
@@ -2840,12 +2839,12 @@ def _transform_feature(self, inputs):
       input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     return lookup_ops.index_table_from_file(
-      vocabulary_file=self.vocabulary_file,
-      num_oov_buckets=self.num_oov_buckets,
-      vocab_size=self.vocabulary_size,
-      default_value=self.default_value,
-      key_dtype=key_dtype,
-      name='{}_lookup'.format(self.key)).lookup(input_tensor)
+        vocabulary_file=self.vocabulary_file,
+        num_oov_buckets=self.num_oov_buckets,
+        vocab_size=self.vocabulary_size,
+        default_value=self.default_value,
+        key_dtype=key_dtype,
+        name='{}_lookup'.format(self.key)).lookup(input_tensor)
 
   @property
   def _num_buckets(self):
@@ -2860,10 +2859,10 @@ def _get_sparse_tensors(self,
 
 
 class _VocabularyListCategoricalColumn(
-  _CategoricalColumn,
-  collections.namedtuple(
-    '_VocabularyListCategoricalColumn',
-    ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
+    _CategoricalColumn,
+    collections.namedtuple(
+        '_VocabularyListCategoricalColumn',
+        ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
 ):
   """See `categorical_column_with_vocabulary_list`."""
 
@@ -2880,13 +2879,13 @@ def _transform_feature(self, inputs):
 
     if self.dtype.is_integer != input_tensor.dtype.is_integer:
       raise ValueError(
-        'Column dtype and SparseTensors dtype must be compatible. '
-        'key: {}, column dtype: {}, tensor dtype: {}'.format(
-          self.key, self.dtype, input_tensor.dtype))
+          'Column dtype and SparseTensors dtype must be compatible. '
+          'key: {}, column dtype: {}, tensor dtype: {}'.format(
+              self.key, self.dtype, input_tensor.dtype))
 
     fc_utils.assert_string_or_int(
-      input_tensor.dtype,
-      prefix='column_name: {} input_tensor'.format(self.key))
+        input_tensor.dtype,
+        prefix='column_name: {} input_tensor'.format(self.key))
 
     key_dtype = self.dtype
     if input_tensor.dtype.is_integer:
@@ -2895,11 +2894,11 @@ def _transform_feature(self, inputs):
       input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     return lookup_ops.index_table_from_tensor(
-      vocabulary_list=tuple(self.vocabulary_list),
-      default_value=self.default_value,
-      num_oov_buckets=self.num_oov_buckets,
-      dtype=key_dtype,
-      name='{}_lookup'.format(self.key)).lookup(input_tensor)
+        vocabulary_list=tuple(self.vocabulary_list),
+        default_value=self.default_value,
+        num_oov_buckets=self.num_oov_buckets,
+        dtype=key_dtype,
+        name='{}_lookup'.format(self.key)).lookup(input_tensor)
 
   @property
   def _num_buckets(self):
@@ -2915,8 +2914,8 @@ def _get_sparse_tensors(self,
 
 class _IdentityCategoricalColumn(_CategoricalColumn,
                                  collections.namedtuple(
-                                   '_IdentityCategoricalColumn',
-                                   ('key', 'num_buckets', 'default_value'))):
+                                     '_IdentityCategoricalColumn',
+                                     ('key', 'num_buckets', 'default_value'))):
   """See `categorical_column_with_identity`."""
 
   @property
@@ -2932,37 +2931,37 @@ def _transform_feature(self, inputs):
 
     if not input_tensor.dtype.is_integer:
       raise ValueError('Invalid input, not integer. key: {} dtype: {}'.format(
-        self.key, input_tensor.dtype))
+          self.key, input_tensor.dtype))
 
     values = math_ops.cast(input_tensor.values, dtypes.int64, name='values')
     num_buckets = math_ops.cast(
-      self.num_buckets, dtypes.int64, name='num_buckets')
+        self.num_buckets, dtypes.int64, name='num_buckets')
     zero = math_ops.cast(0, dtypes.int64, name='zero')
     if self.default_value is None:
       # Fail if values are out-of-range.
       assert_less = check_ops.assert_less(
-        values,
-        num_buckets,
-        data=(values, num_buckets),
-        name='assert_less_than_num_buckets')
+          values,
+          num_buckets,
+          data=(values, num_buckets),
+          name='assert_less_than_num_buckets')
       assert_greater = check_ops.assert_greater_equal(
-        values, zero, data=(values,), name='assert_greater_or_equal_0')
+          values, zero, data=(values,), name='assert_greater_or_equal_0')
       with ops.control_dependencies((assert_less, assert_greater)):
         values = array_ops.identity(values)
     else:
       # Assign default for out-of-range values.
       values = array_ops.where(
-        math_ops.logical_or(
-          values < zero, values >= num_buckets, name='out_of_range'),
-        array_ops.fill(
-          dims=array_ops.shape(values),
-          value=math_ops.cast(self.default_value, dtypes.int64),
-          name='default_values'), values)
+          math_ops.logical_or(
+              values < zero, values >= num_buckets, name='out_of_range'),
+          array_ops.fill(
+              dims=array_ops.shape(values),
+              value=math_ops.cast(self.default_value, dtypes.int64),
+              name='default_values'), values)
 
     return sparse_tensor_lib.SparseTensor(
-      indices=input_tensor.indices,
-      values=values,
-      dense_shape=input_tensor.dense_shape)
+        indices=input_tensor.indices,
+        values=values,
+        dense_shape=input_tensor.dense_shape)
 
   @property
   def _num_buckets(self):
@@ -2977,10 +2976,10 @@ def _get_sparse_tensors(self,
 
 
 class _WeightedCategoricalColumn(
-  _CategoricalColumn,
-  collections.namedtuple(
-    '_WeightedCategoricalColumn',
-    ('categorical_column', 'weight_feature_key', 'dtype'))):
+    _CategoricalColumn,
+    collections.namedtuple(
+        '_WeightedCategoricalColumn',
+        ('categorical_column', 'weight_feature_key', 'dtype'))):
   """See `weighted_categorical_column`."""
 
   @property
@@ -2993,7 +2992,7 @@ def _parse_example_spec(self):
     config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
     if self.weight_feature_key in config:
       raise ValueError('Parse config {} already exists for {}.'.format(
-        config[self.weight_feature_key], self.weight_feature_key))
+          config[self.weight_feature_key], self.weight_feature_key))
     config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
     return config
 
@@ -3006,14 +3005,14 @@ def _transform_feature(self, inputs):
     if weight_tensor is None:
       raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
     weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
-      weight_tensor)
+        weight_tensor)
     if self.dtype != weight_tensor.dtype.base_dtype:
       raise ValueError('Bad dtype, expected {}, but got {}.'.format(
-        self.dtype, weight_tensor.dtype))
+          self.dtype, weight_tensor.dtype))
     if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
       # The weight tensor can be a regular Tensor. In this case, sparsify it.
       weight_tensor = _to_sparse_input_and_drop_ignore_values(
-        weight_tensor, ignore_value=0.0)
+          weight_tensor, ignore_value=0.0)
     if not weight_tensor.dtype.is_floating:
       weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
     return (inputs.get(self.categorical_column), weight_tensor)
@@ -3029,9 +3028,9 @@ def _get_sparse_tensors(self,
 
 
 class _CrossedColumn(
-  _CategoricalColumn,
-  collections.namedtuple('_CrossedColumn',
-                         ['keys', 'hash_bucket_size', 'hash_key'])):
+    _CategoricalColumn,
+    collections.namedtuple('_CrossedColumn',
+                           ['keys', 'hash_bucket_size', 'hash_key'])):
   """See `crossed_column`."""
 
   @property
@@ -3063,16 +3062,16 @@ def _transform_feature(self, inputs):
         ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
         if ids_and_weights.weight_tensor is not None:
           raise ValueError(
-            'crossed_column does not support weight_tensor, but the given '
-            'column populates weight_tensor. '
-            'Given column: {}'.format(key.name))
+              'crossed_column does not support weight_tensor, but the given '
+              'column populates weight_tensor. '
+              'Given column: {}'.format(key.name))
         feature_tensors.append(ids_and_weights.id_tensor)
       else:
         raise ValueError('Unsupported column type. Given: {}'.format(key))
     return sparse_ops.sparse_cross_hashed(
-      inputs=feature_tensors,
-      num_buckets=self.hash_bucket_size,
-      hash_key=self.hash_key)
+        inputs=feature_tensors,
+        num_buckets=self.hash_bucket_size,
+        hash_key=self.hash_key)
 
   @property
   def _num_buckets(self):
@@ -3137,9 +3136,9 @@ def _transform_feature(self, inputs):
     # If the underlying column is weighted, return the input as a dense tensor.
     if weight_tensor is not None:
       weighted_column = sparse_ops.sparse_merge(
-        sp_ids=id_tensor,
-        sp_values=weight_tensor,
-        vocab_size=int(self._variable_shape[-1]))
+          sp_ids=id_tensor,
+          sp_values=weight_tensor,
+          vocab_size=int(self._variable_shape[-1]))
       # Remove (?, -1) index.
       weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                 weighted_column.dense_shape)
@@ -3150,15 +3149,15 @@ def _transform_feature(self, inputs):
                                   weighted_column.dense_shape)
 
     dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
-      id_tensor, default_value=-1)
+        id_tensor, default_value=-1)
 
     # One hot must be float for tf.concat reasons since all other inputs to
     # input_layer are float32.
     one_hot_id_tensor = array_ops.one_hot(
-      dense_id_tensor,
-      depth=self._variable_shape[-1],
-      on_value=1.0,
-      off_value=0.0)
+        dense_id_tensor,
+        depth=self._variable_shape[-1],
+        on_value=1.0,
+        off_value=0.0)
 
     # Reduce to get a multi-hot per example.
     return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])
@@ -3194,14 +3193,14 @@ def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     del trainable
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-        'In indicator_column: {}. '
-        'categorical_column must not be of type _SequenceCategoricalColumn. '
-        'Suggested fix A: If you wish to use input_layer, use a '
-        'non-sequence categorical_column_with_*. '
-        'Suggested fix B: If you wish to create sequence input, use '
-        'sequence_input_layer instead of input_layer. '
-        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                     self.categorical_column))
+          'In indicator_column: {}. '
+          'categorical_column must not be of type _SequenceCategoricalColumn. '
+          'Suggested fix A: If you wish to use input_layer, use a '
+          'non-sequence categorical_column_with_*. '
+          'Suggested fix B: If you wish to create sequence input, use '
+          'sequence_input_layer instead of input_layer. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
     # Feature has been already transformed. Return the intermediate
     # representation created by _transform_feature.
     return inputs.get(self)
@@ -3216,20 +3215,20 @@ def _get_sequence_dense_tensor(self,
     del trainable
     if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
-        'In indicator_column: {}. '
-        'categorical_column must be of type _SequenceCategoricalColumn '
-        'to use sequence_input_layer. '
-        'Suggested fix: Use one of sequence_categorical_column_with_*. '
-        'Given (type {}): {}'.format(self.name, type(self.categorical_column),
-                                     self.categorical_column))
+          'In indicator_column: {}. '
+          'categorical_column must be of type _SequenceCategoricalColumn '
+          'to use sequence_input_layer. '
+          'Suggested fix: Use one of sequence_categorical_column_with_*. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
     # Feature has been already transformed. Return the intermediate
     # representation created by _transform_feature.
     dense_tensor = inputs.get(self)
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     sequence_length = fc_utils.sequence_length_from_sparse_tensor(
-      sparse_tensors.id_tensor)
+        sparse_tensors.id_tensor)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
-      dense_tensor=dense_tensor, sequence_length=sequence_length)
+        dense_tensor=dense_tensor, sequence_length=sequence_length)
 
 
 def _verify_static_batch_size_equality(tensors, columns):
@@ -3252,16 +3251,16 @@ def _verify_static_batch_size_equality(tensors, columns):
         expected_batch_size = tensors[i].shape.dims[0]
       elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
         raise ValueError(
-          'Batch size (first dimension) of each feature must be same. '
-          'Batch size of columns ({}, {}): ({}, {})'.format(
-            columns[bath_size_column_index].name, columns[i].name,
-            expected_batch_size, tensors[i].shape.dims[0]))
+            'Batch size (first dimension) of each feature must be same. '
+            'Batch size of columns ({}, {}): ({}, {})'.format(
+                columns[bath_size_column_index].name, columns[i].name,
+                expected_batch_size, tensors[i].shape.dims[0]))
 
 
 class _SequenceCategoricalColumn(_CategoricalColumn,
                                  collections.namedtuple(
-                                   '_SequenceCategoricalColumn',
-                                   ['categorical_column'])):
+                                     '_SequenceCategoricalColumn',
+                                     ['categorical_column'])):
   """Represents sequences of categorical data."""
 
   @property
diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index fa604926d..82d42508c 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -5,15 +5,20 @@
 import tensorflow as tf
 
 from easy_rec.python.layers import dnn
-from easy_rec.python.layers.common_layers import SENet, EnhancedInputLayer
-from easy_rec.python.layers.common_layers import highway, Concatenate
+from easy_rec.python.layers.common_layers import Concatenate
+from easy_rec.python.layers.common_layers import EnhancedInputLayer
+from easy_rec.python.layers.common_layers import SENet
+from easy_rec.python.layers.common_layers import highway
 from easy_rec.python.layers.fibinet import FiBiNetLayer
-from easy_rec.python.layers.fm import FM, FMLayer
+from easy_rec.python.layers.fm import FMLayer
 from easy_rec.python.layers.mask_net import MaskNet
 from easy_rec.python.layers.numerical_embedding import AutoDisEmbedding
 from easy_rec.python.layers.numerical_embedding import PeriodicEmbedding
+from easy_rec.python.protos import backbone_pb2
+from easy_rec.python.protos import layer_pb2
 from easy_rec.python.utils.dag import DAG
-from easy_rec.python.utils.tf_utils import add_op, dot_op
+from easy_rec.python.utils.tf_utils import add_op
+from easy_rec.python.utils.tf_utils import dot_op
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -29,24 +34,67 @@ def __init__(self, config, model, features, input_layer, l2_reg=None):
     self._l2_reg = l2_reg
     self._dag = DAG()
     self._name_to_blocks = {}
+    input_feature_groups = set()
     for block in config.blocks:
-      self._name_to_blocks[block.name] = block
       self._dag.add_node(block.name)
-    num_blocks = len(self._name_to_blocks)
+      self._name_to_blocks[block.name] = block
+      layer = block.WhichOneof('layer')
+      if layer == 'input_layer':
+        if len(block.inputs) != 0:
+          raise ValueError('no input allowed for input_layer: ' + block.name)
+        input_name = block.name
+        if input_name in input_feature_groups:
+          raise ValueError('input `%s` already exists in other block' %
+                           input_name)
+        else:
+          input_feature_groups.add(input_name)
+
+    num_groups = len(input_feature_groups)
+    num_blocks = len(self._name_to_blocks) - num_groups
     assert num_blocks > 0, 'there must be at least one block in backbone'
+
     for block in config.blocks:
+      layer = block.WhichOneof('layer')
+      if layer == 'input_layer':
+        continue
+      if block.name in input_feature_groups:
+        raise KeyError('block name can not be one of feature groups:' +
+                       block.name)
       assert len(block.inputs) > 0, 'no input for block: %s' % block.name
-      for node in block.inputs:
-        if node in self._name_to_blocks:
-          self._dag.add_edge(node, block.name)
+
+      for input_node in block.inputs:
+        input_name = input_node.name
+        if input_name in self._name_to_blocks:
+          assert input_name != block.name, 'input name can not equal to block name:' + input_name
+          self._dag.add_edge(input_name, block.name)
+        elif input_name not in input_feature_groups:
+          if input_layer.has_group(input_name):
+            logging.info('adding an input_layer block: ' + input_name)
+            new_block = backbone_pb2.Block()
+            new_block.name = input_name
+            new_block.input_layer.CopyFrom(layer_pb2.InputLayer())
+            self._name_to_blocks[input_name] = new_block
+            self._dag.add_node(input_name)
+            self._dag.add_edge(input_name, block.name)
+            input_feature_groups.add(block.name)
+          else:
+            raise KeyError(
+                'invalid input name `%s`, must be the name of either a feature group or an another block'
+                % input_name)
+    num_groups = len(input_feature_groups)
+    assert num_groups > 0, 'there must be at least one input layer'
 
   def block_input(self, config, block_outputs, output_list=False):
     inputs = []
-    for input_name in config.inputs:
+    for input_node in config.inputs:
+      input_name = input_node.name
       if input_name in block_outputs:
         input_feature = block_outputs[input_name]
       else:
-        input_feature, _ = self._input_layer(self._features, input_name)
+        raise KeyError('input name `%s` does not exists' % input_name)
+      if input_node.HasField('input_fn'):
+        fn = eval(input_node.input_fn)
+        input_feature = fn(input_feature)
       inputs.append(input_feature)
 
     if output_list:
@@ -67,14 +115,12 @@ def __call__(self, is_training, *args, **kwargs):
     for block in blocks:
       config = self._name_to_blocks[block]
       layer = config.WhichOneof('layer')
-      if layer == 'input_layer':
-        if len(config.inputs) != 1:
-          raise ValueError('only one input allowed for input_layer: ' +
-                           block.name)
+      if layer is None:  # identity layer
+        block_outputs[block] = self.block_input(config, block_outputs)
+      elif layer == 'input_layer':
         conf = config.input_layer
-        input_layer = EnhancedInputLayer(conf, self._input_layer,
-                                         self._features)
-        output = input_layer(config.inputs[0], is_training)
+        input_fn = EnhancedInputLayer(conf, self._input_layer, self._features)
+        output = input_fn(block, is_training)
         block_outputs[block] = output
       elif layer == 'periodic_embedding':
         input_feature = self.block_input(config, block_outputs)
@@ -131,9 +177,11 @@ def __call__(self, is_training, *args, **kwargs):
         block_outputs[block] = concat(input_feature)
       elif layer == 'reshape':
         input_feature = self.block_input(config, block_outputs)
-        block_outputs[block] = tf.reshape(input_feature, list(config.reshape.dims))
+        block_outputs[block] = tf.reshape(input_feature,
+                                          list(config.reshape.dims))
       elif layer == 'add':
-        input_feature = self.block_input(config, block_outputs, output_list=True)
+        input_feature = self.block_input(
+            config, block_outputs, output_list=True)
         block_outputs[block] = add_op(input_feature)
       elif layer == 'dot':
         input_feature = self.block_input(config, block_outputs)
@@ -142,9 +190,9 @@ def __call__(self, is_training, *args, **kwargs):
         input_feature = self.block_input(config, block_outputs)
         fn = eval(config.Lambda.expression)
         block_outputs[block] = fn(input_feature)
-      elif layer == 'chain':
-        input_feature = self.block_input(config, block_outputs)
-        block_outputs[block] = op_chain(input_feature, config.chain.ops)
+      # elif layer == 'chain':
+      #   input_feature = self.block_input(config, block_outputs)
+      #   block_outputs[block] = op_chain(input_feature, config.chain.ops)
       else:
         raise NotImplementedError('Unsupported backbone layer:' + layer)
 
@@ -154,8 +202,8 @@ def __call__(self, is_training, *args, **kwargs):
         temp.append(block_outputs[output])
       else:
         raise ValueError('No output `%s` of backbone to be concat' % output)
-
     output = concat_inputs(temp, msg='backbone')
+
     if self._config.HasField('top_mlp'):
       no_act = self._config.top_mlp.last_layer_no_activation
       no_bn = self._config.top_mlp.last_layer_no_batch_norm
@@ -202,66 +250,66 @@ def concat_inputs(inputs, axis=-1, msg=''):
   raise ValueError('no inputs to be concat:' + msg)
 
 
-def op_chain(inputs, ops):
-  output = inputs
-  for op in ops:
-    op_name = op.WhichOneOf('Op')
-    output = run_op(output, op_name, op, block='op_chain')
-  return output
-
-
-def run_op(inputs, op_name, config, block='', is_training=False, l2_reg=None):
-  if op_name == 'periodic_embedding':
-    num_emb = PeriodicEmbedding(config.periodic_embedding, scope=block)
-    return num_emb(inputs)
-  elif op_name == 'auto_dis_embedding':
-    num_emb = AutoDisEmbedding(config.auto_dis_embedding, scope=block)
-    return num_emb(inputs)
-  elif op_name == 'highway':
-    conf = config.highway
-    highway_op_name = highway(
-      inputs,
-      conf.emb_size,
-      activation=conf.activation,
-      dropout=conf.dropout_rate,
-      scope=block)
-    return highway_op_name(inputs)
-  elif op_name == 'mlp':
-    mlp = dnn.DNN(
-      config.mlp,
-      l2_reg,
-      name='%s_mlp' % block,
-      is_training=is_training,
-      last_layer_no_activation=config.mlp.last_layer_no_activation,
-      last_layer_no_batch_norm=config.mlp.last_layer_no_batch_norm)
-    return mlp(inputs)
-  elif op_name == 'masknet':
-    mask_net = MaskNet(config.masknet, name=block, reuse=tf.AUTO_REUSE)
-    output = mask_net(inputs, is_training, l2_reg=l2_reg)
-    return output
-  elif op_name == 'senet':
-    senet = SENet(config.senet, name=block)
-    output = senet(inputs)
-    return output
-  elif op_name == 'fibinet':
-    fibinet = FiBiNetLayer(config.fibinet, name=block)
-    output = fibinet(inputs, is_training, l2_reg=l2_reg)
-    return output
-  elif op_name == 'fm':
-    fm = FMLayer(config.fm, name=block)
-    return fm(inputs)
-  if op_name == 'Lambda':
-    fn = eval(config.Lambda.expression)
-    output = fn(inputs)
-  elif op_name == 'concat':
-    concat = Concatenate(config.concat)
-    output = concat(inputs)
-  elif op_name == 'reshape':
-    output = tf.reshape(inputs, list(config.reshape.dims))
-  elif op_name == 'add':
-    output = add_op(inputs)
-  elif op_name == 'dot':
-    output = dot_op(inputs)
-  else:
-    raise NotImplementedError('Unsupported op:' + op_name)
-  return output
+# def op_chain(inputs, ops):
+#  output = inputs
+#  for op in ops:
+#    op_name = op.WhichOneOf('Op')
+#    output = run_op(output, op_name, op, block='op_chain')
+#  return output
+#
+#
+# def run_op(inputs, op_name, config, block='', is_training=False, l2_reg=None):
+#  if op_name == 'periodic_embedding':
+#    num_emb = PeriodicEmbedding(config.periodic_embedding, scope=block)
+#    return num_emb(inputs)
+#  elif op_name == 'auto_dis_embedding':
+#    num_emb = AutoDisEmbedding(config.auto_dis_embedding, scope=block)
+#    return num_emb(inputs)
+#  elif op_name == 'highway':
+#    conf = config.highway
+#    highway_op_name = highway(
+#      inputs,
+#      conf.emb_size,
+#      activation=conf.activation,
+#      dropout=conf.dropout_rate,
+#      scope=block)
+#    return highway_op_name(inputs)
+#  elif op_name == 'mlp':
+#    mlp = dnn.DNN(
+#      config.mlp,
+#      l2_reg,
+#      name='%s_mlp' % block,
+#      is_training=is_training,
+#      last_layer_no_activation=config.mlp.last_layer_no_activation,
+#      last_layer_no_batch_norm=config.mlp.last_layer_no_batch_norm)
+#    return mlp(inputs)
+#  elif op_name == 'masknet':
+#    mask_net = MaskNet(config.masknet, name=block, reuse=tf.AUTO_REUSE)
+#    output = mask_net(inputs, is_training, l2_reg=l2_reg)
+#    return output
+#  elif op_name == 'senet':
+#    senet = SENet(config.senet, name=block)
+#    output = senet(inputs)
+#    return output
+#  elif op_name == 'fibinet':
+#    fibinet = FiBiNetLayer(config.fibinet, name=block)
+#    output = fibinet(inputs, is_training, l2_reg=l2_reg)
+#    return output
+#  elif op_name == 'fm':
+#    fm = FMLayer(config.fm, name=block)
+#    return fm(inputs)
+#  if op_name == 'Lambda':
+#    fn = eval(config.Lambda.expression)
+#    output = fn(inputs)
+#  elif op_name == 'concat':
+#    concat = Concatenate(config.concat)
+#    output = concat(inputs)
+#  elif op_name == 'reshape':
+#    output = tf.reshape(inputs, list(config.reshape.dims))
+#  elif op_name == 'add':
+#    output = add_op(inputs)
+#  elif op_name == 'dot':
+#    output = dot_op(inputs)
+#  else:
+#    raise NotImplementedError('Unsupported op:' + op_name)
+#  return output
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index a453141f9..f06723f68 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -82,6 +82,7 @@ def layer_norm(input_tensor, name=None, reuse=None):
 
 
 class EnhancedInputLayer(object):
+  """Enhance the raw input layer."""
 
   def __init__(self, config, input_layer, feature_dict):
     if config.do_batch_norm and config.do_layer_norm:
@@ -92,56 +93,49 @@ def __init__(self, config, input_layer, feature_dict):
     self._input_layer = input_layer
     self._feature_dict = feature_dict
 
-  def __call__(self, feature_group, is_training, *args, **kwargs):
-    features, feature_list = self._input_layer(self._feature_dict,
-                                               feature_group)
+  def __call__(self, group, is_training, *args, **kwargs):
+    features, feature_list = self._input_layer(self._feature_dict, group)
     num_features = len(feature_list)
 
-    do_feature_dropout = 0.0 < self._config.feature_dropout_rate < 1.0
-    if self._config.output_feature_list or do_feature_dropout:
-      if self._config.do_layer_norm or self._config.do_batch_norm:
-        for i in range(num_features):
-          fea = feature_list[i]
-          if self._config.do_batch_norm:
-            fea = tf.layers.batch_normalization(fea, training=is_training)
-          elif self._config.do_layer_norm:
-            fea = layer_norm(fea)
-          feature_list[i] = fea
-    elif self._config.do_batch_norm:
-      features = tf.layers.batch_normalization(features, training=is_training)
-    elif self._config.do_layer_norm:
-      features = layer_norm(features)
-
-    if do_feature_dropout and is_training:
+    do_ln = self._config.do_layer_norm
+    do_bn = self._config.do_batch_norm
+    do_feature_dropout = is_training and 0.0 < self._config.feature_dropout_rate < 1.0
+    if do_feature_dropout:
       keep_prob = 1.0 - self._config.feature_dropout_rate
       bern = tf.distributions.Bernoulli(probs=keep_prob)
       mask = bern.sample(num_features)
-      for i in range(num_features):
-        fea = tf.div(feature_list[i], keep_prob) * mask[i]
-        feature_list[i] = fea
-      features = tf.concat(feature_list, axis=-1)
+    elif do_bn:
+      features = tf.layers.batch_normalization(features, training=is_training)
+    elif do_ln:
+      features = layer_norm(features)
 
     do_dropout = 0.0 < self._config.dropout_rate < 1.0
-    if self._config.output_feature_list:
-      if do_dropout:
-        for i in range(num_features):
-          fea = feature_list[i]
+    if do_feature_dropout or do_ln or do_bn or do_dropout:
+      for i in range(num_features):
+        fea = feature_list[i]
+        if self._config.do_batch_norm:
+          fea = tf.layers.batch_normalization(fea, training=is_training)
+        elif self._config.do_layer_norm:
+          fea = layer_norm(fea)
+        if do_dropout:
           fea = tf.layers.dropout(
               fea, self._config.dropout_rate, training=is_training)
-          feature_list[i] = fea
-      if self._config.output_3d_tensor:
-        for i in range(num_features):
-          feature_list[i] = tf.expand_dims(feature_list[i], axis=1)
-        return tf.concat(feature_list, axis=1)
-      return feature_list
+        if do_feature_dropout:
+          fea = tf.div(fea, keep_prob) * mask[i]
+        feature_list[i] = fea
+      if do_feature_dropout:
+        features = tf.concat(feature_list, axis=-1)
 
-    if do_dropout:
+    if do_dropout and not do_feature_dropout:
       features = tf.layers.dropout(
           features, self._config.dropout_rate, training=is_training)
 
-    if self._config.output_3d_tensor:
-      dim = int(feature_list[0].shape[-1])
-      return tf.reshape(features, [-1, num_features, dim])
+    if self._config.only_output_feature_list:
+      return feature_list
+    if self._config.only_output_3d_tensor:
+      return tf.stack(feature_list, axis=1)
+    if self._config.output_2d_tensor_and_feature_list:
+      return features, feature_list
     return features
 
 
diff --git a/easy_rec/python/layers/fm.py b/easy_rec/python/layers/fm.py
index 87d621d57..7b0742f6d 100644
--- a/easy_rec/python/layers/fm.py
+++ b/easy_rec/python/layers/fm.py
@@ -32,6 +32,7 @@ class FMLayer(object):
   References
     - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
   """
+
   def __init__(self, config, name='fm'):
     self.name = name
     self.config = config
@@ -59,8 +60,8 @@ def __call__(self, inputs):
 
     with tf.name_scope(self.name):
       square_of_sum = tf.square(tf.reduce_sum(fea, axis=1))
-      sum_of_square = tf.reduce_sum(fea * fea, axis=1)
-      cross_term = square_of_sum - sum_of_square
+      sum_of_square = tf.reduce_sum(tf.square(fea), axis=1)
+      cross_term = tf.subtract(square_of_sum, sum_of_square)
       if self.config.use_variant:
         cross_term = 0.5 * cross_term
       else:
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index 33cd681ad..df1a17b25 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -17,9 +17,10 @@
 from easy_rec.python.layers.common_layers import text_cnn
 from easy_rec.python.layers.fscd_layer import FSCDLayer
 from easy_rec.python.protos.feature_config_pb2 import WideOrDeep
-from easy_rec.python.utils import shape_utils, conditional
+from easy_rec.python.utils import conditional
+from easy_rec.python.utils import shape_utils
 
-from easy_rec.python.compat.feature_column.feature_column_v2 import is_embedding_column
+from easy_rec.python.compat.feature_column.feature_column_v2 import is_embedding_column  # NOQA
 
 
 class InputLayer(object):
@@ -97,7 +98,7 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
     if is_combine:
       with conditional(self._is_predicting, ops.device('/CPU:0')):
         concat_features, group_features = self.single_call_input_layer(
-          features, group_name, feature_name_to_output_tensors)
+            features, group_name, feature_name_to_output_tensors)
       if group_name in self._group_name_to_seq_features:
         # for target attention
         group_seq_arr = self._group_name_to_seq_features[group_name]
diff --git a/easy_rec/python/layers/keras/__init__.py b/easy_rec/python/layers/keras/__init__.py
new file mode 100644
index 000000000..c4006b39c
--- /dev/null
+++ b/easy_rec/python/layers/keras/__init__.py
@@ -0,0 +1 @@
+from .dot_interaction import DotInteraction
diff --git a/easy_rec/python/layers/keras/dcn.py b/easy_rec/python/layers/keras/dcn.py
new file mode 100644
index 000000000..2f35bdc5d
--- /dev/null
+++ b/easy_rec/python/layers/keras/dcn.py
@@ -0,0 +1,182 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+"""Implements `Cross` Layer, the cross layer in Deep & Cross Network (DCN)."""
+
+import tensorflow as tf
+
+
+class Cross(tf.keras.layers.Layer):
+  """Cross Layer in Deep & Cross Network to learn explicit feature interactions.
+
+    A layer that creates explicit and bounded-degree feature interactions
+    efficiently. The `call` method accepts `inputs` as a tuple of size 2
+    tensors. The first input `x0` is the base layer that contains the original
+    features (usually the embedding layer); the second input `xi` is the output
+    of the previous `Cross` layer in the stack, i.e., the i-th `Cross`
+    layer. For the first `Cross` layer in the stack, x0 = xi.
+
+    The output is x_{i+1} = x0 .* (W * xi + bias + diag_scale * xi) + xi,
+    where .* designates elementwise multiplication, W could be a full-rank
+    matrix, or a low-rank matrix U*V to reduce the computational cost, and
+    diag_scale increases the diagonal of W to improve training stability (
+    especially for the low-rank case).
+
+    References:
+        1. [R. Wang et al.](https://arxiv.org/pdf/2008.13535.pdf)
+          See Eq. (1) for full-rank and Eq. (2) for low-rank version.
+        2. [R. Wang et al.](https://arxiv.org/pdf/1708.05123.pdf)
+
+    Example:
+
+        ```python
+        # after embedding layer in a functional model:
+        input = tf.keras.Input(shape=(None,), name='index', dtype=tf.int64)
+        x0 = tf.keras.layers.Embedding(input_dim=32, output_dim=6)
+        x1 = Cross()(x0, x0)
+        x2 = Cross()(x0, x1)
+        logits = tf.keras.layers.Dense(units=10)(x2)
+        model = tf.keras.Model(input, logits)
+        ```
+
+    Args:
+        projection_dim: project dimension to reduce the computational cost.
+          Default is `None` such that a full (`input_dim` by `input_dim`) matrix
+          W is used. If enabled, a low-rank matrix W = U*V will be used, where U
+          is of size `input_dim` by `projection_dim` and V is of size
+          `projection_dim` by `input_dim`. `projection_dim` need to be smaller
+          than `input_dim`/2 to improve the model efficiency. In practice, we've
+          observed that `projection_dim` = d/4 consistently preserved the
+          accuracy of a full-rank version.
+        diag_scale: a non-negative float used to increase the diagonal of the
+          kernel W by `diag_scale`, that is, W + diag_scale * I, where I is an
+          identity matrix.
+        use_bias: whether to add a bias term for this layer. If set to False,
+          no bias term will be used.
+        preactivation: Activation applied to output matrix of the layer, before
+          multiplication with the input. Can be used to control the scale of the
+          layer's outputs and improve stability.
+        kernel_initializer: Initializer to use on the kernel matrix.
+        bias_initializer: Initializer to use on the bias vector.
+        kernel_regularizer: Regularizer to use on the kernel matrix.
+        bias_regularizer: Regularizer to use on bias vector.
+
+    Input shape: A tuple of 2 (batch_size, `input_dim`) dimensional inputs.
+    Output shape: A single (batch_size, `input_dim`) dimensional output.
+  """
+
+  def __init__(self, config, **kwargs):
+    super(Cross, self).__init__(**kwargs)
+    self._projection_dim = config.projection_dim
+    self._diag_scale = config.diag_scale
+    self._use_bias = config.use_bias
+    self._preactivation = tf.keras.activations.get(config.preactivation)
+    self._kernel_initializer = tf.keras.initializers.get(config.kernel_initializer)
+    self._bias_initializer = tf.keras.initializers.get(config.bias_initializer)
+    self._kernel_regularizer = tf.keras.regularizers.get(config.kernel_regularizer)
+    self._bias_regularizer = tf.keras.regularizers.get(config.bias_regularizer)
+    self._input_dim = None
+    self._supports_masking = True
+
+    if self._diag_scale < 0:  # pytype: disable=unsupported-operands
+      raise ValueError(
+          "`diag_scale` should be non-negative. Got `diag_scale` = {}".format(
+              self._diag_scale))
+
+  def build(self, input_shape):
+    last_dim = input_shape[-1]
+
+    if self._projection_dim is None:
+      self._dense = tf.keras.layers.Dense(
+          last_dim,
+          kernel_initializer=_clone_initializer(self._kernel_initializer),
+          bias_initializer=self._bias_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer,
+          use_bias=self._use_bias,
+          dtype=self.dtype,
+          activation=self._preactivation,
+      )
+    else:
+      self._dense_u = tf.keras.layers.Dense(
+          self._projection_dim,
+          kernel_initializer=_clone_initializer(self._kernel_initializer),
+          kernel_regularizer=self._kernel_regularizer,
+          use_bias=False,
+          dtype=self.dtype,
+      )
+      self._dense_v = tf.keras.layers.Dense(
+          last_dim,
+          kernel_initializer=_clone_initializer(self._kernel_initializer),
+          bias_initializer=self._bias_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer,
+          use_bias=self._use_bias,
+          dtype=self.dtype,
+          activation=self._preactivation,
+      )
+    self.built = True
+
+  def call(self, inputs, **kwargs):
+    """Computes the feature cross.
+
+    Args:
+      inputs: The input tensor(x0, x)
+      - x0: The input tensor
+      - x: Optional second input tensor. If provided, the layer will compute
+        crosses between x0 and x; if not provided, the layer will compute
+        crosses between x0 and itself.
+
+    Returns:
+     Tensor of crosses.
+    """
+    if isinstance(inputs, (list, tuple)):
+      x0, x = inputs
+    else:
+      x0, x = inputs, inputs
+
+    if not self.built:
+      self.build(x0.shape)
+
+    if x0.shape[-1] != x.shape[-1]:
+      raise ValueError(
+          "`x0` and `x` dimension mismatch! Got `x0` dimension {}, and x "
+          "dimension {}. This case is not supported yet.".format(
+              x0.shape[-1], x.shape[-1]))
+
+    if self._projection_dim is None:
+      prod_output = self._dense(x)
+    else:
+      prod_output = self._dense_v(self._dense_u(x))
+
+    prod_output = tf.cast(prod_output, self.compute_dtype)
+
+    if self._diag_scale:
+      prod_output = prod_output + self._diag_scale * x
+
+    return x0 * prod_output + x
+
+  def get_config(self):
+    config = {
+        "projection_dim":
+            self._projection_dim,
+        "diag_scale":
+            self._diag_scale,
+        "use_bias":
+            self._use_bias,
+        "preactivation":
+            tf.keras.activations.serialize(self._preactivation),
+        "kernel_initializer":
+            tf.keras.initializers.serialize(self._kernel_initializer),
+        "bias_initializer":
+            tf.keras.initializers.serialize(self._bias_initializer),
+        "kernel_regularizer":
+            tf.keras.regularizers.serialize(self._kernel_regularizer),
+        "bias_regularizer":
+            tf.keras.regularizers.serialize(self._bias_regularizer),
+    }
+    base_config = super(Cross, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+def _clone_initializer(initializer):
+  return initializer.__class__.from_config(initializer.get_config())
diff --git a/easy_rec/python/layers/keras/dot_interaction.py b/easy_rec/python/layers/keras/dot_interaction.py
new file mode 100644
index 000000000..50a3966af
--- /dev/null
+++ b/easy_rec/python/layers/keras/dot_interaction.py
@@ -0,0 +1,92 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+"""Implements `Dot Interaction` Layer of DLRM model."""
+
+import tensorflow as tf
+
+
+class DotInteraction(tf.keras.layers.Layer):
+  """Dot interaction layer.
+
+  See theory in the DLRM paper: https://arxiv.org/pdf/1906.00091.pdf,
+  section 2.1.3. Sparse activations and dense activations are combined.
+  Dot interaction is applied to a batch of input Tensors [e1,...,e_k] of the
+  same dimension and the output is a batch of Tensors with all distinct pairwise
+  dot products of the form dot(e_i, e_j) for i <= j if self self_interaction is
+  True, otherwise dot(e_i, e_j) i < j.
+
+  Attributes:
+    self_interaction: Boolean indicating if features should self-interact.
+      If it is True, then the diagonal entries of the interaction metric are
+      also taken.
+    skip_gather: An optimization flag. If it's set then the upper triangle part
+      of the dot interaction matrix dot(e_i, e_j) is set to 0. The resulting
+      activations will be of dimension [num_features * num_features] from which
+      half will be zeros. Otherwise activations will be only lower triangle part
+      of the interaction matrix. The later saves space but is much slower.
+    name: String name of the layer.
+  """
+
+  def __init__(self,
+               config,
+               self_interaction=False,
+               skip_gather=False,
+               name=None,
+               **kwargs):
+    self._self_interaction = config.self_interaction
+    self._skip_gather = config.skip_gather
+    super(DotInteraction, self).__init__(name=name, **kwargs)
+
+  def call(self, inputs, **kwargs):
+    """Performs the interaction operation on the tensors in the list.
+
+    The tensors represent as transformed dense features and embedded categorical
+    features.
+    Pre-condition: The tensors should all have the same shape.
+
+    Args:
+      inputs: List of features with shapes [batch_size, feature_dim].
+
+    Returns:
+      activations: Tensor representing interacted features. It has a dimension
+      `num_features * num_features` if skip_gather is True, otherside
+      `num_features * (num_features + 1) / 2` if self_interaction is True and
+      `num_features * (num_features - 1) / 2` if self_interaction is False.
+    """
+    num_features = len(inputs)
+    batch_size = tf.shape(inputs[0])[0]
+    feature_dim = tf.shape(inputs[0])[1]
+    # concat_features shape: batch_size, num_features, feature_dim
+    try:
+      concat_features = tf.concat(inputs, axis=-1)
+      concat_features = tf.reshape(concat_features,
+                                   [batch_size, -1, feature_dim])
+    except (ValueError, tf.errors.InvalidArgumentError) as e:
+      raise ValueError('Input tensors` dimensions must be equal, original'
+                       'error message: {}'.format(e))
+
+    # Interact features, select lower-triangular portion, and re-shape.
+    xactions = tf.matmul(concat_features, concat_features, transpose_b=True)
+    ones = tf.ones_like(xactions)
+    if self._self_interaction:
+      # Selecting lower-triangular portion including the diagonal.
+      lower_tri_mask = tf.linalg.band_part(ones, -1, 0)
+      upper_tri_mask = ones - lower_tri_mask
+      out_dim = num_features * (num_features + 1) // 2
+    else:
+      # Selecting lower-triangular portion not included the diagonal.
+      upper_tri_mask = tf.linalg.band_part(ones, 0, -1)
+      lower_tri_mask = ones - upper_tri_mask
+      out_dim = num_features * (num_features - 1) // 2
+
+    if self._skip_gather:
+      # Setting upper triangle part of the interaction matrix to zeros.
+      activations = tf.where(
+          condition=tf.cast(upper_tri_mask, tf.bool),
+          x=tf.zeros_like(xactions),
+          y=xactions)
+      out_dim = num_features * num_features
+    else:
+      activations = tf.boolean_mask(xactions, lower_tri_mask)
+    activations = tf.reshape(activations, (batch_size, out_dim))
+    return activations
diff --git a/easy_rec/python/layers/numerical_embedding.py b/easy_rec/python/layers/numerical_embedding.py
index 1c45fa361..6b571a3ad 100644
--- a/easy_rec/python/layers/numerical_embedding.py
+++ b/easy_rec/python/layers/numerical_embedding.py
@@ -47,6 +47,7 @@ def __init__(self, n_tokens, d_in, d_out, bias=True, scope='nd_linear'):
         d_in: the input dimension
         d_out: the output dimension
         bias: indicates if the underlying linear layers have biases
+        scope: variable scope name
     """
     with tf.variable_scope(scope):
       self.weight = tf.get_variable(
@@ -100,6 +101,7 @@ def __init__(self, config, scope='periodic_embedding'):
         A similar grid would be ``[1e-2, 1e-1, 1e0, 1e1, 1e2]``.
         If possible, add more intermidiate values to this grid.
       config.output_3d_tensor: whether to output a 3d tensor
+      scope: variable scope name
     """
     self.config = config
     if config.embedding_dim % 2:
@@ -130,19 +132,22 @@ def __call__(self, inputs, *args, **kwargs):
         act = get_activation(self.config.linear_activation)
         if callable(act):
           emb = act(emb)
+      output = tf.reshape(emb, [-1, num_features * dim])
 
+      if self.config.output_tensor_list:
+        return output, tf.unstack(emb, axis=1)
       if self.config.output_3d_tensor:
-        return emb
-      return tf.reshape(emb, [-1, num_features * dim])
+        return output, emb
+      return output
 
 
 class AutoDisEmbedding(object):
+  """An Embedding Learning Framework for Numerical Features in CTR Prediction.
 
-  def __init__(self, config, scope='auto_dis'):
-    """An Embedding Learning Framework for Numerical Features in CTR Prediction.
+  Refer: https://arxiv.org/pdf/2012.08986v2.pdf
+  """
 
-    Refer: https://arxiv.org/pdf/2012.08986v2.pdf
-    """
+  def __init__(self, config, scope='auto_dis'):
     self.config = config
     self.emb_dim = config.embedding_dim
     self.num_bins = config.num_bins
@@ -161,22 +166,25 @@ def __call__(self, inputs, *args, **kwargs):
       mat = tf.get_variable(
           'project_mat', shape=[1, num_features, self.num_bins, self.num_bins])
 
-      x = tf.expand_dims(inputs, axis=-1)  # [B, num_fea, 1]
-      hidden = tf.nn.leaky_relu(w * x)  # [B, num_fea, num_bin]
+      x = tf.expand_dims(inputs, axis=-1)  # [B, N, 1]
+      hidden = tf.nn.leaky_relu(w * x)  # [B, N, num_bin]
 
-      y = tf.matmul(mat, hidden[..., None])  # [B, num_fea, num_bin, 1]
-      y = tf.squeeze(y, axis=3)  # [B, num_fea, num_bin]
+      y = tf.matmul(mat, hidden[..., None])  # [B, N, num_bin, 1]
+      y = tf.squeeze(y, axis=3)  # [B, N, num_bin]
 
-      # keep_prob(float): if dropout_flag is True, keep_prob rate to keep connect; (float, keep_prob=0.8)
+      # keep_prob(float): if dropout_flag is True, keep_prob rate to keep connect
       alpha = self.config.keep_prob
-      x_bar = y + alpha * hidden  # [B, num_fea, num_bin]
+      x_bar = y + alpha * hidden  # [B, N, num_bin]
       t = self.config.temperature
-      x_hat = tf.nn.softmax(x_bar / t)  # [B, num_fea, num_bin]
+      x_hat = tf.nn.softmax(x_bar / t)  # [B, N, num_bin]
+
+      emb = tf.matmul(x_hat[:, :, None, :], meta_emb)  # [B, N, 1, D]
+      emb = tf.squeeze(emb, axis=2)  # [B, N, D]
+      output = tf.reshape(emb, [-1, self.emb_dim * num_features])  # [B, N*D]
+
+      if self.config.output_tensor_list:
+        return output, tf.unstack(emb, axis=1)
 
-      emb = tf.matmul(x_hat[:, :, None, :], meta_emb)  # [B, num_fea, 1, emb_dim]
-      # emb = tf.squeeze(emb, axis=2)  # [B, num_fea, emb_dim]
       if self.config.output_3d_tensor:
-        return tf.reshape(
-            emb, [-1, num_features, self.emb_dim])  # [B, num_fea, emb_dim]
-      return tf.reshape(
-          emb, [-1, self.emb_dim * num_features])  # [B, num_fea*emb_dim]
+        return output, emb
+      return output
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index 331d0282e..f1a3189f2 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -50,7 +50,7 @@ def __init__(self,
     self._emb_reg = regularizers.l2_regularizer(self.embedding_regularization)
     self._l2_reg = regularizers.l2_regularizer(self.l2_regularization)
     # only used by model with wide feature groups, e.g. WideAndDeep
-    self._wide_output_dim = -1
+    self._wide_output_dim = self.get_wide_output_dim()
 
     self._feature_configs = feature_configs
     self.build_input_layer(model_config, feature_configs)
@@ -115,6 +115,13 @@ def l2_regularization(self):
       l2_regularization = model_config.l2_regularization
     return l2_regularization
 
+  def get_wide_output_dim(self):
+    model_config = getattr(self._base_model_config,
+                           self._base_model_config.WhichOneof('model'))
+    if hasattr(model_config, 'wide_output_dim'):
+      return model_config.wide_output_dim
+    return -1
+
   def build_input_layer(self, model_config, feature_configs):
     self._input_layer = input_layer.InputLayer(
         feature_configs,
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index 0285f225c..7d6b9e877 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -32,7 +32,7 @@ def __init__(self,
   def build_predict_graph(self):
     if not self.has_backbone:
       raise NotImplementedError(
-        'method `build_predict_graph` must be implemented when backbone network do not exits'
+          'method `build_predict_graph` must be implemented when backbone network do not exits'
       )
     output = self.backbone
 
@@ -57,9 +57,9 @@ def _output_to_prediction_impl(self,
                                  suffix=''):
     prediction_dict = {}
     binary_loss_type = {
-      LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
-      LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
-      LossType.PAIRWISE_LOGISTIC_LOSS
+        LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
+        LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
+        LossType.PAIRWISE_LOGISTIC_LOSS
     }
     if loss_type in binary_loss_type:
       assert num_class == 1, 'num_class must be 1 when loss type is %s' % loss_type.name
@@ -86,9 +86,9 @@ def _output_to_prediction_impl(self,
         prediction_dict['logits' + suffix] = output
         prediction_dict['probs' + suffix] = probs
         prediction_dict['logits' + suffix + '_y'] = math_ops.reduce_max(
-          output, axis=1)
+            output, axis=1)
         prediction_dict['probs' + suffix + '_y'] = math_ops.reduce_max(
-          probs, axis=1)
+            probs, axis=1)
         prediction_dict['y' + suffix] = tf.argmax(output, axis=1)
     elif loss_type == LossType.L2_LOSS:
       output = tf.squeeze(output, axis=1)
@@ -101,12 +101,12 @@ def _output_to_prediction_impl(self,
   def _add_to_prediction_dict(self, output):
     if len(self._losses) == 0:
       prediction_dict = self._output_to_prediction_impl(
-        output, loss_type=self._loss_type, num_class=self._num_class)
+          output, loss_type=self._loss_type, num_class=self._num_class)
       self._prediction_dict.update(prediction_dict)
     else:
       for loss in self._losses:
         prediction_dict = self._output_to_prediction_impl(
-          output, loss_type=loss.loss_type, num_class=self._num_class)
+            output, loss_type=loss.loss_type, num_class=self._num_class)
         self._prediction_dict.update(prediction_dict)
 
   def build_rtp_output_dict(self):
@@ -118,9 +118,9 @@ def build_rtp_output_dict(self):
       op = tf.get_default_graph().get_operation_by_name('rank_predict')
       if len(op.outputs) != 1:
         raise ValueError(
-          ('failed to build RTP rank_predict output: op {}[{}] has output ' +
-           'size {}, however 1 is expected.').format(op.name, op.type,
-                                                     len(op.outputs)))
+            ('failed to build RTP rank_predict output: op {}[{}] has output ' +
+             'size {}, however 1 is expected.').format(op.name, op.type,
+                                                       len(op.outputs)))
       rank_predict = op.outputs[0]
     except KeyError:
       forwarded = None
@@ -128,32 +128,32 @@ def build_rtp_output_dict(self):
       if len(self._losses) > 0:
         loss_types = {loss.loss_type for loss in self._losses}
       binary_loss_set = {
-        LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
-        LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
-        LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
-        LossType.JRC_LOSS
+          LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
+          LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
+          LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
+          LossType.JRC_LOSS
       }
       if loss_types & binary_loss_set:
         if 'probs' in self._prediction_dict:
           forwarded = self._prediction_dict['probs']
         else:
           raise ValueError(
-            'failed to build RTP rank_predict output: classification model ' +
-            "expect 'probs' prediction, which is not found. Please check if" +
-            ' build_predict_graph() is called.')
+              'failed to build RTP rank_predict output: classification model ' +
+              "expect 'probs' prediction, which is not found. Please check if" +
+              ' build_predict_graph() is called.')
       elif loss_types & {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
         if 'y' in self._prediction_dict:
           forwarded = self._prediction_dict['y']
         else:
           raise ValueError(
-            'failed to build RTP rank_predict output: regression model expect'
-            +
-            "'y' prediction, which is not found. Please check if build_predic"
-            + 't_graph() is called.')
+              'failed to build RTP rank_predict output: regression model expect'
+              +
+              "'y' prediction, which is not found. Please check if build_predic"
+              + 't_graph() is called.')
       else:
         logging.warning(
-          'failed to build RTP rank_predict: unsupported loss type {}'.format(
-            loss_types))
+            'failed to build RTP rank_predict: unsupported loss type {}'.format(
+                loss_types))
       if forwarded is not None:
         rank_predict = tf.identity(forwarded, name='rank_predict')
     if rank_predict is not None:
@@ -170,9 +170,9 @@ def _build_loss_impl(self,
                        loss_param=None):
     loss_dict = {}
     binary_loss_type = {
-      LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
-      LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
-      LossType.PAIRWISE_LOGISTIC_LOSS, LossType.JRC_LOSS
+        LossType.F1_REWEIGHTED_LOSS, LossType.PAIR_WISE_LOSS,
+        LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
+        LossType.PAIRWISE_LOGISTIC_LOSS, LossType.JRC_LOSS
     }
     if loss_type == LossType.CLASSIFICATION:
       loss_name = loss_name if loss_name else 'cross_entropy_loss' + suffix
@@ -196,23 +196,23 @@ def _build_loss_impl(self,
       if hasattr(loss_param, 'session_name'):
         kwargs['session_ids'] = self._feature_dict[loss_param.session_name]
     loss_dict[loss_name] = loss_builder.build(
-      loss_type,
-      self._labels[label_name],
-      pred,
-      loss_weight,
-      num_class,
-      loss_param=loss_param,
-      **kwargs)
+        loss_type,
+        self._labels[label_name],
+        pred,
+        loss_weight,
+        num_class,
+        loss_param=loss_param,
+        **kwargs)
     return loss_dict
 
   def build_loss_graph(self):
     loss_dict = {}
     if len(self._losses) == 0:
       loss_dict = self._build_loss_impl(
-        self._loss_type,
-        label_name=self._label_name,
-        loss_weight=self._sample_weight,
-        num_class=self._num_class)
+          self._loss_type,
+          label_name=self._label_name,
+          loss_weight=self._sample_weight,
+          num_class=self._num_class)
     else:
       strategy = self._base_model_config.loss_weight_strategy
       loss_weight = [1.0]
@@ -224,26 +224,26 @@ def build_loss_graph(self):
         if loss_param is not None:
           loss_param = getattr(loss, loss_param)
         loss_ops = self._build_loss_impl(
-          loss.loss_type,
-          label_name=self._label_name,
-          loss_weight=self._sample_weight,
-          num_class=self._num_class,
-          loss_name=loss.loss_name,
-          loss_param=loss_param)
+            loss.loss_type,
+            label_name=self._label_name,
+            loss_weight=self._sample_weight,
+            num_class=self._num_class,
+            loss_name=loss.loss_name,
+            loss_param=loss_param)
         for loss_name, loss_value in loss_ops.items():
           if strategy == self._base_model_config.Fixed:
             loss_dict[loss_name] = loss_value * loss.weight
           elif strategy == self._base_model_config.Uncertainty:
             if loss.learn_loss_weight:
               uncertainty = tf.Variable(
-                0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
+                  0, name='%s_loss_weight' % loss_name, dtype=tf.float32)
               tf.summary.scalar('loss/%s_uncertainty' % loss_name, uncertainty)
               if loss.loss_type in {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
                 loss_dict[loss_name] = 0.5 * tf.exp(
-                  -uncertainty) * loss_value + 0.5 * uncertainty
+                    -uncertainty) * loss_value + 0.5 * uncertainty
               else:
                 loss_dict[loss_name] = tf.exp(
-                  -uncertainty) * loss_value + 0.5 * uncertainty
+                    -uncertainty) * loss_value + 0.5 * uncertainty
             else:
               loss_dict[loss_name] = loss_value * loss.weight
           elif strategy == self._base_model_config.Random:
@@ -272,10 +272,10 @@ def _build_metric_impl(self,
     from easy_rec.python.core.easyrec_metrics import metrics_tf
     from easy_rec.python.core import metrics as metrics_lib
     binary_loss_set = {
-      LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
-      LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
-      LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
-      LossType.JRC_LOSS
+        LossType.CLASSIFICATION, LossType.F1_REWEIGHTED_LOSS,
+        LossType.PAIR_WISE_LOSS, LossType.BINARY_FOCAL_LOSS,
+        LossType.PAIRWISE_FOCAL_LOSS, LossType.PAIRWISE_LOGISTIC_LOSS,
+        LossType.JRC_LOSS
     }
     metric_dict = {}
     if metric.WhichOneof('metric') == 'auc':
@@ -283,15 +283,15 @@ def _build_metric_impl(self,
       if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['auc' + suffix] = metrics_tf.auc(
-          label,
-          self._prediction_dict['probs' + suffix],
-          num_thresholds=metric.auc.num_thresholds)
+            label,
+            self._prediction_dict['probs' + suffix],
+            num_thresholds=metric.auc.num_thresholds)
       elif num_class == 2:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['auc' + suffix] = metrics_tf.auc(
-          label,
-          self._prediction_dict['probs' + suffix][:, 1],
-          num_thresholds=metric.auc.num_thresholds)
+            label,
+            self._prediction_dict['probs' + suffix][:, 1],
+            num_thresholds=metric.auc.num_thresholds)
       else:
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'gauc':
@@ -301,20 +301,20 @@ def _build_metric_impl(self,
         uids = self._feature_dict[metric.gauc.uid_field]
         if isinstance(uids, tf.sparse.SparseTensor):
           uids = tf.sparse_to_dense(
-            uids.indices, uids.dense_shape, uids.values, default_value='')
+              uids.indices, uids.dense_shape, uids.values, default_value='')
           uids = tf.reshape(uids, [-1])
         metric_dict['gauc' + suffix] = metrics_lib.gauc(
-          label,
-          self._prediction_dict['probs' + suffix],
-          uids=uids,
-          reduction=metric.gauc.reduction)
+            label,
+            self._prediction_dict['probs' + suffix],
+            uids=uids,
+            reduction=metric.gauc.reduction)
       elif num_class == 2:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['gauc' + suffix] = metrics_lib.gauc(
-          label,
-          self._prediction_dict['probs' + suffix][:, 1],
-          uids=self._feature_dict[metric.gauc.uid_field],
-          reduction=metric.gauc.reduction)
+            label,
+            self._prediction_dict['probs' + suffix][:, 1],
+            uids=self._feature_dict[metric.gauc.uid_field],
+            reduction=metric.gauc.reduction)
       else:
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'session_auc':
@@ -322,17 +322,17 @@ def _build_metric_impl(self,
       if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['session_auc' + suffix] = metrics_lib.session_auc(
-          label,
-          self._prediction_dict['probs' + suffix],
-          session_ids=self._feature_dict[metric.session_auc.session_id_field],
-          reduction=metric.session_auc.reduction)
+            label,
+            self._prediction_dict['probs' + suffix],
+            session_ids=self._feature_dict[metric.session_auc.session_id_field],
+            reduction=metric.session_auc.reduction)
       elif num_class == 2:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['session_auc' + suffix] = metrics_lib.session_auc(
-          label,
-          self._prediction_dict['probs' + suffix][:, 1],
-          session_ids=self._feature_dict[metric.session_auc.session_id_field],
-          reduction=metric.session_auc.reduction)
+            label,
+            self._prediction_dict['probs' + suffix][:, 1],
+            session_ids=self._feature_dict[metric.session_auc.session_id_field],
+            reduction=metric.session_auc.reduction)
       else:
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'max_f1':
@@ -340,11 +340,11 @@ def _build_metric_impl(self,
       if num_class == 1 or loss_type & {LossType.JRC_LOSS}:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['max_f1' + suffix] = metrics_lib.max_f1(
-          label, self._prediction_dict['logits' + suffix])
+            label, self._prediction_dict['logits' + suffix])
       elif num_class == 2:
         label = tf.to_int64(self._labels[label_name])
         metric_dict['max_f1' + suffix] = metrics_lib.max_f1(
-          label, self._prediction_dict['logits' + suffix][:, 1])
+            label, self._prediction_dict['logits' + suffix][:, 1])
       else:
         raise ValueError('Wrong class number')
     elif metric.WhichOneof('metric') == 'recall_at_topk':
@@ -352,18 +352,18 @@ def _build_metric_impl(self,
       assert num_class > 1
       label = tf.to_int64(self._labels[label_name])
       metric_dict['recall_at_topk' + suffix] = metrics_tf.recall_at_k(
-        label, self._prediction_dict['logits' + suffix],
-        metric.recall_at_topk.topk)
+          label, self._prediction_dict['logits' + suffix],
+          metric.recall_at_topk.topk)
     elif metric.WhichOneof('metric') == 'mean_absolute_error':
       label = tf.to_float(self._labels[label_name])
       if loss_type & {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
         metric_dict['mean_absolute_error' +
                     suffix] = metrics_tf.mean_absolute_error(
-          label, self._prediction_dict['y' + suffix])
+                        label, self._prediction_dict['y' + suffix])
       elif loss_type & {LossType.CLASSIFICATION} and num_class == 1:
         metric_dict['mean_absolute_error' +
                     suffix] = metrics_tf.mean_absolute_error(
-          label, self._prediction_dict['probs' + suffix])
+                        label, self._prediction_dict['probs' + suffix])
       else:
         assert False, 'mean_absolute_error is not supported for this model'
     elif metric.WhichOneof('metric') == 'mean_squared_error':
@@ -371,11 +371,11 @@ def _build_metric_impl(self,
       if loss_type & {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
         metric_dict['mean_squared_error' +
                     suffix] = metrics_tf.mean_squared_error(
-          label, self._prediction_dict['y' + suffix])
+                        label, self._prediction_dict['y' + suffix])
       elif num_class == 1 and loss_type & binary_loss_set:
         metric_dict['mean_squared_error' +
                     suffix] = metrics_tf.mean_squared_error(
-          label, self._prediction_dict['probs' + suffix])
+                        label, self._prediction_dict['probs' + suffix])
       else:
         assert False, 'mean_squared_error is not supported for this model'
     elif metric.WhichOneof('metric') == 'root_mean_squared_error':
@@ -383,11 +383,11 @@ def _build_metric_impl(self,
       if loss_type & {LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS}:
         metric_dict['root_mean_squared_error' +
                     suffix] = metrics_tf.root_mean_squared_error(
-          label, self._prediction_dict['y' + suffix])
+                        label, self._prediction_dict['y' + suffix])
       elif loss_type & {LossType.CLASSIFICATION} and num_class == 1:
         metric_dict['root_mean_squared_error' +
                     suffix] = metrics_tf.root_mean_squared_error(
-          label, self._prediction_dict['probs' + suffix])
+                        label, self._prediction_dict['probs' + suffix])
       else:
         assert False, 'root_mean_squared_error is not supported for this model'
     elif metric.WhichOneof('metric') == 'accuracy':
@@ -395,7 +395,7 @@ def _build_metric_impl(self,
       assert num_class > 1
       label = tf.to_int64(self._labels[label_name])
       metric_dict['accuracy' + suffix] = metrics_tf.accuracy(
-        label, self._prediction_dict['y' + suffix])
+          label, self._prediction_dict['y' + suffix])
     return metric_dict
 
   def build_metric_graph(self, eval_config):
@@ -405,18 +405,18 @@ def build_metric_graph(self, eval_config):
       loss_types = {loss.loss_type for loss in self._losses}
     for metric in eval_config.metrics_set:
       metric_dict.update(
-        self._build_metric_impl(
-          metric,
-          loss_type=loss_types,
-          label_name=self._label_name,
-          num_class=self._num_class))
+          self._build_metric_impl(
+              metric,
+              loss_type=loss_types,
+              label_name=self._label_name,
+              num_class=self._num_class))
     return metric_dict
 
   def _get_outputs_impl(self, loss_type, num_class=1, suffix=''):
     binary_loss_set = {
-      LossType.F1_REWEIGHTED_LOSS, LossType.JRC_LOSS, LossType.PAIR_WISE_LOSS,
-      LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
-      LossType.PAIRWISE_LOGISTIC_LOSS
+        LossType.F1_REWEIGHTED_LOSS, LossType.JRC_LOSS, LossType.PAIR_WISE_LOSS,
+        LossType.BINARY_FOCAL_LOSS, LossType.PAIRWISE_FOCAL_LOSS,
+        LossType.PAIRWISE_LOGISTIC_LOSS
     }
     if loss_type in binary_loss_set:
       return ['probs' + suffix, 'logits' + suffix]
@@ -425,8 +425,8 @@ def _get_outputs_impl(self, loss_type, num_class=1, suffix=''):
         return ['probs' + suffix, 'logits' + suffix]
       else:
         return [
-          'y' + suffix, 'probs' + suffix, 'logits' + suffix,
-          'probs' + suffix + '_y', 'logits' + suffix + '_y'
+            'y' + suffix, 'probs' + suffix, 'logits' + suffix,
+            'probs' + suffix + '_y', 'logits' + suffix + '_y'
         ]
     elif loss_type in [LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS]:
       return ['y' + suffix]
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
index b77be93be..b37b14b2c 100644
--- a/easy_rec/python/protos/backbone.proto
+++ b/easy_rec/python/protos/backbone.proto
@@ -16,37 +16,24 @@ message Lambda {
     required string expression = 1;
 }
 
-message Operator {
-    oneof Op {
-        MLP mlp = 102;
-        PeriodicEmbedding periodic_embedding = 103;
-        AutoDisEmbedding auto_dis_embedding = 104;
-        SequenceLayer sequence_encoder = 105;
-        HighWayTower highway = 106;
-        MaskNet masknet = 107;
-        SENet senet = 108;
-        FiBiNetTower fibinet = 109;
-        FM fm = 110;
-        Concatenate concat = 111;
-        Reshape reshape = 112;
-        Add add = 113;
-        Dot dot = 114;
-        Lambda Lambda = 115;
-        OpChain chain = 116;
-    }
+message Input {
+    required string name = 1;
+    optional string input_fn = 2;
 }
 
-message OpChain {
-    repeated Operator ops = 1;
+message KerasLayer {
+    required string class_name = 1;
+    optional Any params = 2;
 }
 
 message Block {
     required string name = 1;
     // the input names of feature groups or other blocks
-    repeated string inputs = 2;
+    repeated Input inputs = 2;
     optional int32 input_concat_axis = 3 [default = -1];
     optional string extra_input_fn = 4;
     oneof layer {
+        Lambda Lambda = 100;
         InputLayer input_layer = 101;
         MLP mlp = 102;
         PeriodicEmbedding periodic_embedding = 103;
@@ -57,12 +44,11 @@ message Block {
         SENet senet = 108;
         FiBiNetTower fibinet = 109;
         FM fm = 110;
-        Concatenate concat = 111;
-        Reshape reshape = 112;
+        // Concatenate concat = 111;
+        // Reshape reshape = 112;
         Add add = 113;
         Dot dot = 114;
-        Lambda Lambda = 115;
-        OpChain chain = 116;
+        //OpChain chain = 116;
     }
 }
 
@@ -71,3 +57,26 @@ message BackboneTower {
     repeated string concat_blocks = 2;
     optional MLP top_mlp = 3;
 }
+
+//message Operator {
+//    oneof Op {
+//        MLP mlp = 102;
+//        PeriodicEmbedding periodic_embedding = 103;
+//        AutoDisEmbedding auto_dis_embedding = 104;
+//        HighWayTower highway = 106;
+//        MaskNet masknet = 107;
+//        SENet senet = 108;
+//        FiBiNetTower fibinet = 109;
+//        FM fm = 110;
+//        Concatenate concat = 111;
+//        Reshape reshape = 112;
+//        Add add = 113;
+//        Dot dot = 114;
+//        Lambda Lambda = 115;
+//        OpChain chain = 116;
+//    }
+//}
+//
+//message OpChain {
+//    repeated Operator ops = 1;
+//}
diff --git a/easy_rec/python/protos/dnn.proto b/easy_rec/python/protos/dnn.proto
index 1564394eb..00fe79d82 100644
--- a/easy_rec/python/protos/dnn.proto
+++ b/easy_rec/python/protos/dnn.proto
@@ -24,4 +24,4 @@ message MLP {
     optional bool use_bn = 4 [default = true];
     optional bool last_layer_no_activation = 5 [default = false];
     optional bool last_layer_no_batch_norm = 6 [default = false];
-}
\ No newline at end of file
+}
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 940ee88f3..48c6f4f8d 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -33,6 +33,7 @@ message DummyModel {
 message RankModel {
   optional float l2_regularization = 1;
   optional bool add_head_logits_layer = 2 [default=true];
+  optional uint32 wide_output_dim = 3;
 }
 
 // for knowledge distillation
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index 576bfdf4f..e7ad65460 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -8,8 +8,9 @@ message InputLayer {
     optional bool do_layer_norm = 2;
     optional float dropout_rate = 3;
     optional float feature_dropout_rate = 4;
-    optional bool output_feature_list = 5;
-    optional bool output_3d_tensor = 6;
+    optional bool only_output_feature_list = 5;
+    optional bool only_output_3d_tensor = 6;
+    optional bool output_2d_tensor_and_feature_list = 7;
 }
 
 message HighWayTower {
@@ -25,6 +26,7 @@ message PeriodicEmbedding {
     optional bool add_linear_layer = 3 [default = true];
     optional string linear_activation = 4 [default = 'relu'];
     optional bool output_3d_tensor = 5;
+    optional bool output_tensor_list = 6;
 }
 
 message AutoDisEmbedding {
@@ -33,6 +35,7 @@ message AutoDisEmbedding {
     required float keep_prob = 3 [default = 0.8];
     required float temperature = 4;
     optional bool output_3d_tensor = 5;
+    optional bool output_tensor_list = 6;
 }
 
 message Concatenate {
@@ -49,4 +52,4 @@ message Add {
 }
 
 message Dot {
-}
\ No newline at end of file
+}
diff --git a/easy_rec/python/train_eval.py b/easy_rec/python/train_eval.py
index 51c904451..f12784ac1 100644
--- a/easy_rec/python/train_eval.py
+++ b/easy_rec/python/train_eval.py
@@ -95,12 +95,11 @@
       help='is use check mode')
   parser.add_argument(
       '--selected_cols', type=str, default=None, help='select input columns')
-  parser.add_argument(
-    '--gpu', type=str, default=None, help='gpu id')
+  parser.add_argument('--gpu', type=str, default=None, help='gpu id')
   args, extra_args = parser.parse_known_args()
 
   if args.gpu is not None:
-    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
 
   edit_config_json = {}
   if args.edit_config_json:
diff --git a/easy_rec/python/utils/__init__.py b/easy_rec/python/utils/__init__.py
index 8a9b460ac..09dc89476 100644
--- a/easy_rec/python/utils/__init__.py
+++ b/easy_rec/python/utils/__init__.py
@@ -1,17 +1,15 @@
-
 class conditional(object):
-    """Wrap another context manager and enter it only if condition is true.
-    """
+  """Wrap another context manager and enter it only if condition is true."""
 
-    def __init__(self, condition, contextmanager):
-        self.condition = condition
-        self.contextmanager = contextmanager
+  def __init__(self, condition, contextmanager):
+    self.condition = condition
+    self.contextmanager = contextmanager
 
-    def __enter__(self):
-        """Conditionally enter a context manager."""
-        if self.condition:
-            return self.contextmanager.__enter__()
+  def __enter__(self):
+    """Conditionally enter a context manager."""
+    if self.condition:
+      return self.contextmanager.__enter__()
 
-    def __exit__(self, *args):
-        if self.condition:
-            return self.contextmanager.__exit__(*args)
+  def __exit__(self, *args):
+    if self.condition:
+      return self.contextmanager.__exit__(*args)
diff --git a/easy_rec/python/utils/load_class.py b/easy_rec/python/utils/load_class.py
index 2da1e4e41..efd2cc9cb 100644
--- a/easy_rec/python/utils/load_class.py
+++ b/easy_rec/python/utils/load_class.py
@@ -220,3 +220,30 @@ def create_class(cls, name):
       return newclass
 
   return RegisterABCMeta
+
+
+def load_keras_layer(name):
+  """Load keras layer class.
+
+  Args:
+    name: keras layer name
+
+  Return:
+    modules or functions or classes
+  """
+  name = name.strip()
+  if name == '' or name is None:
+    return None
+
+  path = 'easy_rec.python.layers.keras.' + name
+  try:
+    return pydoc.locate(path)
+  except pydoc.ErrorDuringImport:
+    path = 'tensorflow.keras.layers.' + name
+    try:
+      return pydoc.locate(path)
+    except pydoc.ErrorDuringImport:
+      print('load keras layer %s failed' % name)
+      logging.error('load keras layer %s failed: %s' %
+                    (name, traceback.format_exc()))
+      return None
diff --git a/easy_rec/python/utils/tf_utils.py b/easy_rec/python/utils/tf_utils.py
index efcd7df12..e4d39c012 100644
--- a/easy_rec/python/utils/tf_utils.py
+++ b/easy_rec/python/utils/tf_utils.py
@@ -62,7 +62,7 @@ def dot_op(features):
   """Compute inner dot between any two pair tensors.
 
   Args:
-    features:
+    features: must be one of
     - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
     - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
   Return:
diff --git a/examples/configs/deepfm_backbone_on_criteo.config b/examples/configs/deepfm_backbone_on_criteo.config
index c94838daf..467d8ad55 100644
--- a/examples/configs/deepfm_backbone_on_criteo.config
+++ b/examples/configs/deepfm_backbone_on_criteo.config
@@ -1,14 +1,17 @@
 train_input_path: "examples/data/criteo/criteo_train_data"
 eval_input_path: "examples/data/criteo/criteo_test_data"
-model_dir: "examples/ckpt/deepfm_backbone_criteo"
+model_dir: "examples/ckpt/deepfm_backbone_criteo_w"
 
 train_config {
   log_step_count_steps: 500
   optimizer_config: {
     adam_optimizer: {
       learning_rate: {
-        constant_learning_rate {
-          learning_rate: 0.001
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
         }
       }
     }
@@ -328,19 +331,19 @@ feature_config: {
   }
   features: {
     input_names: "C1"
-    hash_bucket_size: 2000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C2"
-    hash_bucket_size: 1000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C3"
-    hash_bucket_size: 2500000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -352,132 +355,132 @@ feature_config: {
   }
   features: {
     input_names: "C5"
-    hash_bucket_size: 500
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C6"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C7"
-    hash_bucket_size: 13000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C8"
-    hash_bucket_size: 1000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C9"
-    hash_bucket_size: 10
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C10"
-    hash_bucket_size: 100000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C11"
-    hash_bucket_size: 6000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C12"
-    hash_bucket_size: 2000000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C13"
-    hash_bucket_size: 4000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C14"
-    hash_bucket_size: 100
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C15"
-    hash_bucket_size: 20000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C16"
-    hash_bucket_size: 1250000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C17"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C18"
-    hash_bucket_size: 6000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C19"
-    hash_bucket_size: 3000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C20"
-    hash_bucket_size: 10
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C21"
-    hash_bucket_size: 1250000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C22"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C23"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C24"
-    hash_bucket_size: 280000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }features: {
     input_names: "C25"
-    hash_bucket_size: 200
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C26"
-    hash_bucket_size: 150000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -485,7 +488,7 @@ feature_config: {
 model_config: {
   model_class: 'RankModel'
   feature_groups: {
-    group_name: "features"
+    group_name: "deep_features"
     feature_names: "F1"
     feature_names: "F2"
     feature_names: "F3"
@@ -527,35 +530,98 @@ model_config: {
     feature_names: "C26"
     wide_deep:DEEP
   }
+  feature_groups: {
+    group_name: "wide_features"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:WIDE
+  }
   backbone {
     blocks {
-      name: 'emb_list'
-      inputs: 'features'
+      name: 'wide_features'
       input_layer {
-        output_feature_list: true
+      }
+    }
+    blocks {
+      name: 'wide_logit'
+      inputs {
+        name: 'wide_features'
+      }
+      Lambda {
+        expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
+      }
+    }
+    blocks {
+      name: 'deep_features'
+      input_layer {
+        output_2d_tensor_and_feature_list: true
       }
     }
     blocks {
       name: 'fm'
-      inputs: 'emb_list'
+      inputs {
+        name: 'deep_features'
+        input_fn: 'lambda x: x[1]'
+      }
       fm {
         use_variant: true
       }
     }
     blocks {
       name: 'deep'
-      inputs: 'features'
+      inputs {
+        name: 'deep_features'
+        input_fn: 'lambda x: x[0]'
+      }
       mlp {
         hidden_units: [256, 128, 64]
       }
     }
-    concat_blocks: ['fm', 'deep']
+    concat_blocks: ['wide_logit', 'fm', 'deep']
     top_mlp {
       hidden_units: [256, 128, 64]
     }
   }
   rank_model {
     l2_regularization: 1e-5
+    wide_output_dim: 1
   }
   embedding_regularization: 1e-5
 }
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
index 04dde5589..970508598 100644
--- a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
+++ b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
@@ -7,8 +7,11 @@ train_config {
   optimizer_config: {
     adam_optimizer: {
       learning_rate: {
-        constant_learning_rate {
-          learning_rate: 0.001
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
         }
       }
     }
@@ -315,19 +318,19 @@ feature_config: {
   }
   features: {
     input_names: "C1"
-    hash_bucket_size: 2000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C2"
-    hash_bucket_size: 1000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C3"
-    hash_bucket_size: 2500000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -339,135 +342,239 @@ feature_config: {
   }
   features: {
     input_names: "C5"
-    hash_bucket_size: 500
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C6"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C7"
-    hash_bucket_size: 13000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C8"
-    hash_bucket_size: 1000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C9"
-    hash_bucket_size: 10
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C10"
-    hash_bucket_size: 100000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C11"
-    hash_bucket_size: 6000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C12"
-    hash_bucket_size: 2000000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C13"
-    hash_bucket_size: 4000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C14"
-    hash_bucket_size: 100
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C15"
-    hash_bucket_size: 20000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C16"
-    hash_bucket_size: 1250000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C17"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C18"
-    hash_bucket_size: 6000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C19"
-    hash_bucket_size: 3000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C20"
-    hash_bucket_size: 10
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C21"
-    hash_bucket_size: 1250000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C22"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C23"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C24"
-    hash_bucket_size: 280000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }features: {
     input_names: "C25"
-    hash_bucket_size: 200
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C26"
-    hash_bucket_size: 150000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
+  features: {
+    feature_name: "D1"
+    input_names: "F1"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    feature_name: "D2"
+    input_names: "F2"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    feature_name: "D3"
+    input_names: "F3"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    feature_name: "D4"
+    input_names: "F4"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    feature_name: "D5"
+    input_names: "F5"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    feature_name: "D6"
+    input_names: "F6"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    feature_name: "D7"
+    input_names: "F7"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    feature_name: "D8"
+    input_names: "F8"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    feature_name: "D9"
+    input_names: "F9"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    feature_name: "D10"
+    input_names: "F10"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    feature_name: "D11"
+    input_names: "F11"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    feature_name: "D12"
+    input_names: "F12"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    feature_name: "D13"
+    input_names: "F13"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
 }
 model_config: {
   model_class: 'RankModel'
@@ -518,56 +625,114 @@ model_config: {
     feature_names: "C26"
     wide_deep:DEEP
   }
+  feature_groups: {
+    group_name: "wide_features"
+    feature_names: "D1"
+    feature_names: "D2"
+    feature_names: "D3"
+    feature_names: "D4"
+    feature_names: "D5"
+    feature_names: "D6"
+    feature_names: "D7"
+    feature_names: "D8"
+    feature_names: "D9"
+    feature_names: "D10"
+    feature_names: "D11"
+    feature_names: "D12"
+    feature_names: "D13"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:WIDE
+  }
   backbone {
     blocks {
-      name: 'cat_emb'
-      inputs: 'categorical_features'
-      input_layer {
-        output_3d_tensor: true
+      name: 'wide_logit'
+      inputs {
+        name: 'wide_features'
+      }
+      Lambda {
+        expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
       }
     }
     blocks {
       name: 'num_emb'
-      inputs: 'numerical_features'
+      inputs {
+        name: 'numerical_features'
+      }
       auto_dis_embedding {
         embedding_dim: 16
         num_bins: 20
         temperature: 0.815
-        output_3d_tensor: true
+        output_tensor_list: true
       }
     }
     blocks {
-      name: 'fm'
-      inputs: 'cat_emb'
-      inputs: 'num_emb'
-      input_concat_axis: 1
-      fm {
-        use_variant: true
+      name: 'categorical_features'
+      input_layer {
+        output_2d_tensor_and_feature_list: true
       }
     }
     blocks {
-      name: 'cat_and_num'
-      inputs: 'cat_emb'
-      inputs: 'num_emb'
-      input_concat_axis: 1
-      reshape {
-        dims: [-1, 624]
+      name: 'fm'
+      inputs {
+        name: 'categorical_features'
+        input_fn: 'lambda x: x[1]'
+      }
+      inputs {
+        name: 'num_emb'
+        input_fn: 'lambda x: x[1]'
+      }
+      fm {
+        use_variant: true
       }
     }
     blocks {
       name: 'deep'
-      inputs: 'cat_and_num'
+      inputs {
+        name: 'categorical_features'
+        input_fn: 'lambda x: x[0]'
+      }
+      inputs {
+        name: 'num_emb'
+        input_fn: 'lambda x: x[0]'
+      }
       mlp {
         hidden_units: [256, 128, 64]
       }
     }
-    concat_blocks: ['fm', 'deep']
+    // no wide_logit may have better performance
+    concat_blocks: ['wide_logit', 'fm', 'deep']
     top_mlp {
       hidden_units: [256, 128, 64]
     }
   }
   rank_model {
     l2_regularization: 1e-5
+    wide_output_dim: 1
   }
   embedding_regularization: 1e-5
 }
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_periodic.config b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
index 2affcc9ae..82dd01998 100644
--- a/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
+++ b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
@@ -7,8 +7,11 @@ train_config {
   optimizer_config: {
     adam_optimizer: {
       learning_rate: {
-        constant_learning_rate {
-          learning_rate: 0.001
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
         }
       }
     }
@@ -315,19 +318,19 @@ feature_config: {
   }
   features: {
     input_names: "C1"
-    hash_bucket_size: 2000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C2"
-    hash_bucket_size: 1000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C3"
-    hash_bucket_size: 2500000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -339,135 +342,239 @@ feature_config: {
   }
   features: {
     input_names: "C5"
-    hash_bucket_size: 500
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C6"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C7"
-    hash_bucket_size: 13000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C8"
-    hash_bucket_size: 1000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C9"
-    hash_bucket_size: 10
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C10"
-    hash_bucket_size: 100000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C11"
-    hash_bucket_size: 6000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C12"
-    hash_bucket_size: 2000000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C13"
-    hash_bucket_size: 4000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C14"
-    hash_bucket_size: 100
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C15"
-    hash_bucket_size: 20000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C16"
-    hash_bucket_size: 1250000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C17"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C18"
-    hash_bucket_size: 6000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C19"
-    hash_bucket_size: 3000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C20"
-    hash_bucket_size: 10
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C21"
-    hash_bucket_size: 1250000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C22"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C23"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C24"
-    hash_bucket_size: 280000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }features: {
     input_names: "C25"
-    hash_bucket_size: 200
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C26"
-    hash_bucket_size: 150000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
+  features: {
+    feature_name: "D1"
+    input_names: "F1"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    feature_name: "D2"
+    input_names: "F2"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    feature_name: "D3"
+    input_names: "F3"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    feature_name: "D4"
+    input_names: "F4"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    feature_name: "D5"
+    input_names: "F5"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    feature_name: "D6"
+    input_names: "F6"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    feature_name: "D7"
+    input_names: "F7"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    feature_name: "D8"
+    input_names: "F8"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    feature_name: "D9"
+    input_names: "F9"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    feature_name: "D10"
+    input_names: "F10"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    feature_name: "D11"
+    input_names: "F11"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    feature_name: "D12"
+    input_names: "F12"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    feature_name: "D13"
+    input_names: "F13"
+    embedding_dim:16
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
 }
 model_config: {
   model_class: 'RankModel'
@@ -518,54 +625,112 @@ model_config: {
     feature_names: "C26"
     wide_deep:DEEP
   }
+  feature_groups: {
+    group_name: "wide_features"
+    feature_names: "D1"
+    feature_names: "D2"
+    feature_names: "D3"
+    feature_names: "D4"
+    feature_names: "D5"
+    feature_names: "D6"
+    feature_names: "D7"
+    feature_names: "D8"
+    feature_names: "D9"
+    feature_names: "D10"
+    feature_names: "D11"
+    feature_names: "D12"
+    feature_names: "D13"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:WIDE
+  }
   backbone {
     blocks {
-      name: 'cat_emb'
-      inputs: 'categorical_features'
-      input_layer {
-        output_3d_tensor: true
+      name: 'wide_logit'
+      inputs {
+        name: 'wide_features'
+      }
+      Lambda {
+        expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
       }
     }
     blocks {
       name: 'num_emb'
-      inputs: 'numerical_features'
+      inputs {
+        name: 'numerical_features'
+      }
       periodic_embedding {
         embedding_dim: 16
-        output_3d_tensor: true
+        sigma: 0.005
+        output_tensor_list: true
       }
     }
     blocks {
-      name: 'fm'
-      inputs: 'cat_emb'
-      inputs: 'num_emb'
-      input_concat_axis: 1
-      fm {
-        use_variant: true
+      name: 'categorical_features'
+      input_layer {
+        output_2d_tensor_and_feature_list: true
       }
     }
     blocks {
-      name: 'cat_and_num'
-      inputs: 'cat_emb'
-      inputs: 'num_emb'
-      input_concat_axis: 1
-      reshape {
-        dims: [-1, 624]
+      name: 'fm'
+      inputs {
+        name: 'categorical_features'
+        input_fn: 'lambda x: x[1]'
+      }
+      inputs {
+        name: 'num_emb'
+        input_fn: 'lambda x: x[1]'
+      }
+      fm {
+        use_variant: true
       }
     }
     blocks {
       name: 'deep'
-      inputs: 'cat_and_num'
+      inputs {
+        name: 'categorical_features'
+        input_fn: 'lambda x: x[0]'
+      }
+      inputs {
+        name: 'num_emb'
+        input_fn: 'lambda x: x[0]'
+      }
       mlp {
         hidden_units: [256, 128, 64]
       }
     }
-    concat_blocks: ['fm', 'deep']
+    concat_blocks: ['wide_logit', 'fm', 'deep']
     top_mlp {
       hidden_units: [256, 128, 64]
     }
   }
   rank_model {
     l2_regularization: 1e-5
+    wide_output_dim: 1
   }
   embedding_regularization: 1e-5
 }
diff --git a/examples/configs/dlrm_backbone_on_criteo.config b/examples/configs/dlrm_backbone_on_criteo.config
index 7d698e858..e87acef39 100644
--- a/examples/configs/dlrm_backbone_on_criteo.config
+++ b/examples/configs/dlrm_backbone_on_criteo.config
@@ -1,3 +1,4 @@
+# align with raw dlrm model
 train_input_path: "examples/data/criteo/criteo_train_data"
 eval_input_path: "examples/data/criteo/criteo_test_data"
 model_dir: "examples/ckpt/dlrm_backbone_criteo"
@@ -7,8 +8,11 @@ train_config {
   optimizer_config: {
     adam_optimizer: {
       learning_rate: {
-        constant_learning_rate {
-          learning_rate: 0.001
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
         }
       }
     }
@@ -315,19 +319,19 @@ feature_config: {
   }
   features: {
     input_names: "C1"
-    hash_bucket_size: 2000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C2"
-    hash_bucket_size: 1000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C3"
-    hash_bucket_size: 2500000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -339,132 +343,132 @@ feature_config: {
   }
   features: {
     input_names: "C5"
-    hash_bucket_size: 500
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C6"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C7"
-    hash_bucket_size: 13000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C8"
-    hash_bucket_size: 1000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C9"
-    hash_bucket_size: 10
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C10"
-    hash_bucket_size: 100000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C11"
-    hash_bucket_size: 6000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C12"
-    hash_bucket_size: 2000000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C13"
-    hash_bucket_size: 4000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C14"
-    hash_bucket_size: 100
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C15"
-    hash_bucket_size: 20000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C16"
-    hash_bucket_size: 1250000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C17"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C18"
-    hash_bucket_size: 6000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C19"
-    hash_bucket_size: 3000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C20"
-    hash_bucket_size: 10
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C21"
-    hash_bucket_size: 1250000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C22"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C23"
-    hash_bucket_size: 50
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C24"
-    hash_bucket_size: 280000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }features: {
     input_names: "C25"
-    hash_bucket_size: 200
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
   features: {
     input_names: "C26"
-    hash_bucket_size: 150000
+    hash_bucket_size: 1000000
     feature_type: IdFeature
     embedding_dim: 16
   }
@@ -521,42 +525,41 @@ model_config: {
   backbone {
     blocks {
       name: 'bottom_mlp'
-      inputs: 'dense'
+      inputs {
+        name: 'dense'
+      }
       mlp {
         hidden_units: [64, 32, 16]
       }
     }
     blocks {
-      name: 'bottom_list'
-      inputs: 'bottom_mlp'
-      Lambda {
-        expression: 'lambda x: [x]'
-      }
-    }
-    blocks {
-      name: 'sparse_features'
-      inputs: 'sparse'
+      name: 'sparse'
       input_layer {
-        output_feature_list: true
+        output_2d_tensor_and_feature_list: true
       }
     }
     blocks {
       name: 'dot'
-      inputs: 'bottom_list'
-      inputs: 'sparse_features'
+      inputs {
+        name: 'bottom_mlp'
+        input_fn: 'lambda x: [x]'
+      }
+      inputs {
+        name: 'sparse'
+        input_fn: 'lambda x: x[1]'
+      }
       dot { }
     }
     blocks {
-      name: 'dot_and_dense'
-      inputs: 'bottom_mlp'
-      inputs: 'dot'
-      concat {
-        axis: 1
+      name: 'sparse_2d'
+      inputs {
+        name: 'sparse'
+        input_fn: 'lambda x: x[0]'
       }
     }
-    concat_blocks: ['dot_and_dense']
+    concat_blocks: ['sparse_2d', 'dot']
     top_mlp {
-      hidden_units: [128, 64]
+      hidden_units: [256, 128, 64]
     }
   }
   rank_model {
diff --git a/examples/configs/dlrm_on_criteo.config b/examples/configs/dlrm_on_criteo.config
new file mode 100644
index 000000000..e6c45d574
--- /dev/null
+++ b/examples/configs/dlrm_on_criteo.config
@@ -0,0 +1,534 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_criteo_ckpt"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_class: 'DLRM'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  dlrm {
+    bot_dnn {
+      hidden_units: [64, 32, 16]
+    }
+    top_dnn {
+      hidden_units: [256, 128, 64]
+    }
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/dlrm_on_criteo_with_autodis.config b/examples/configs/dlrm_on_criteo_with_autodis.config
new file mode 100644
index 000000000..eb81e0a05
--- /dev/null
+++ b/examples/configs/dlrm_on_criteo_with_autodis.config
@@ -0,0 +1,578 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_autodis_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'num_emb'
+      inputs {
+        name: 'dense'
+      }
+      auto_dis_embedding {
+        embedding_dim: 16
+        num_bins: 20
+        temperature: 0.815
+        output_tensor_list: true
+      }
+    }
+    blocks {
+      name: 'sparse'
+      input_layer {
+        output_2d_tensor_and_feature_list: true
+      }
+    }
+    blocks {
+      name: 'dot'
+      inputs {
+        name: 'num_emb'
+        input_fn: 'lambda x: x[1]'
+      }
+      inputs {
+        name: 'sparse'
+        input_fn: 'lambda x: x[1]'
+      }
+      dot { }
+    }
+    blocks {
+      name: 'sparse_2d'
+      inputs {
+        name: 'sparse'
+        input_fn: 'lambda x: x[0]'
+      }
+    }
+    blocks {
+      name: 'num_emb_2d'
+      inputs {
+        name: 'num_emb'
+        input_fn: 'lambda x: x[0]'
+      }
+    }
+    concat_blocks: ['num_emb_2d', 'dot', 'sparse_2d']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/dlrm_standard_on_criteo.config b/examples/configs/dlrm_standard_on_criteo.config
new file mode 100644
index 000000000..131a94607
--- /dev/null
+++ b/examples/configs/dlrm_standard_on_criteo.config
@@ -0,0 +1,560 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_standard_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'bottom_mlp'
+      inputs {
+        name: 'dense'
+      }
+      mlp {
+        hidden_units: [64, 32, 16]
+      }
+    }
+    blocks {
+      name: 'sparse'
+      input_layer {
+        only_output_feature_list: true
+      }
+    }
+    blocks {
+      name: 'dot'
+      inputs {
+        name: 'bottom_mlp'
+        input_fn: 'lambda x: [x]'
+      }
+      inputs {
+        name: 'sparse'
+      }
+      dot { }
+    }
+    concat_blocks: ['bottom_mlp', 'dot']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/data/criteo/process_criteo_kaggle.py b/examples/data/criteo/process_criteo_kaggle.py
index 5b9cb4f34..e610e33a6 100644
--- a/examples/data/criteo/process_criteo_kaggle.py
+++ b/examples/data/criteo/process_criteo_kaggle.py
@@ -5,6 +5,12 @@
 target_columns = ['label']
 columns = target_columns + dense_features + category_features
 
+# data_train = pd.read_csv(
+#     'criteo_train_data', sep='\t', names=columns)
+#
+# for col in category_features:
+#     print(col, data_train[col].nunique())
+
 data_train = pd.read_csv(
     'criteo_kaggle_display/train.txt', sep='\t', names=columns)
 
diff --git a/examples/readme.md b/examples/readme.md
index 286b292b1..94643541e 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -209,25 +209,29 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
 
 - MovieLens-1M
 
-  | Model     | Epoch | AUC    |
-  | --------- | ----- | ------ |
-  | Wide&Deep | 1     | 0.8558 |
-  | DeepFM    | 1     | 0.8688 |
-  | DeepFM(Backbone)|1| 0.8876 |
-  | DCN       | 1     | 0.8576 |
-  | AutoInt   | 1     | 0.8513 |
-  | MaskNet   | 1     | 0.8872 |
-  | FibiNet   | 1     | 0.8879 |
+  | Model            | Epoch | AUC    |
+  | ---------------- | ----- | ------ |
+  | Wide&Deep        | 1     | 0.8558 |
+  | DeepFM           | 1     | 0.8688 |
+  | DeepFM(Backbone) | 1     | 0.8876 |
+  | DCN              | 1     | 0.8576 |
+  | AutoInt          | 1     | 0.8513 |
+  | MaskNet          | 1     | 0.8872 |
+  | FibiNet          | 1     | 0.8879 |
 
 - Criteo-Research
 
-  | Model  | Epoch | AUC    |
-  | ------ | ----- | ------ |
-  | FM     | 1     | 0.7577 |
-  | DeepFM | 1     | 0.7967 |
-  | DeepFM(backbone)| 1 | 0.7965 |
-  | DeepFM(periodic)| 1 | 0.7982 |
-  | DeepFM(autodis) | 1 | 0.7983 |
+  | Model             | Epoch | AUC    |
+  | ----------------- | ----- | ------ |
+  | FM                | 1     | 0.7577 |
+  | DeepFM            | 1     | 0.7970 |
+  | DeepFM (backbone) | 1     | 0.7970 |
+  | DeepFM (periodic) | 1     | 0.7980 |
+  | DeepFM (autodis)  | 1     | 0.7979 |
+  | DLRM              | 1     | 0.79785 |
+  | DLRM (backbone)   | 1     | 0.7993 |
+  | DLRM (standard)   | 1     | 0.7949 |
+  | DLRM (autodis)    | 1     | 0.7984 |
 
 ### 召回模型
 

From 5cf7d8f205328a2aa58d38cccbb199ee1502e483 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Sun, 18 Jun 2023 16:57:39 +0800
Subject: [PATCH 34/54] [feat]: add more backbone blocks

---
 easy_rec/python/layers/backbone.py            | 307 ++++-----
 easy_rec/python/layers/common_layers.py       | 176 +-----
 easy_rec/python/layers/fibinet.py             |  54 --
 easy_rec/python/layers/fm.py                  |  43 --
 easy_rec/python/layers/keras/__init__.py      |  12 +
 easy_rec/python/layers/keras/blocks.py        | 117 ++++
 easy_rec/python/layers/{ => keras}/bst.py     |  24 +-
 easy_rec/python/layers/keras/dcn.py           | 154 ++---
 easy_rec/python/layers/{ => keras}/din.py     |  37 +-
 .../python/layers/keras/dot_interaction.py    |  35 +-
 easy_rec/python/layers/keras/fibinet.py       | 229 +++++++
 easy_rec/python/layers/keras/fm.py            |  46 ++
 easy_rec/python/layers/keras/mask_net.py      | 102 +++
 .../layers/{ => keras}/numerical_embedding.py | 100 +--
 easy_rec/python/layers/mask_net.py            | 108 ----
 easy_rec/python/layers/sequence_encoder.py    |   4 +-
 easy_rec/python/layers/utils.py               |  57 ++
 easy_rec/python/model/easy_rec_model.py       | 102 +--
 easy_rec/python/model/rank_model.py           |  20 +-
 easy_rec/python/protos/backbone.proto         |  77 +--
 easy_rec/python/protos/dnn.proto              |  10 +-
 easy_rec/python/protos/easy_rec_model.proto   |   5 +-
 easy_rec/python/protos/feature_config.proto   |   2 +-
 easy_rec/python/protos/fibinet.proto          |  23 -
 easy_rec/python/protos/keras_layer.proto      |  26 +
 easy_rec/python/protos/layer.proto            |  42 +-
 easy_rec/python/protos/masknet.proto          |  17 -
 easy_rec/python/utils/load_class.py           |  20 +-
 easy_rec/python/utils/tf_utils.py             |  65 +-
 .../configs/deepfm_backbone_on_criteo.config  |  26 +-
 .../deepfm_backbone_on_movielens.config       |  72 ++-
 examples/configs/deepfm_on_movielens.config   |   2 +-
 .../configs/dlrm_backbone_on_criteo.config    |  11 +-
 .../dlrm_on_criteo_with_autodis.config        |  17 +-
 .../dlrm_on_criteo_with_periodic.config       | 591 ++++++++++++++++++
 .../configs/dlrm_standard_on_criteo.config    |  11 +-
 examples/configs/fibinet_on_movielens.config  |  37 +-
 examples/configs/masknet_on_movielens.config  |  37 +-
 examples/readme.md                            |  25 +-
 39 files changed, 1798 insertions(+), 1045 deletions(-)
 delete mode 100644 easy_rec/python/layers/fibinet.py
 create mode 100644 easy_rec/python/layers/keras/blocks.py
 rename easy_rec/python/layers/{ => keras}/bst.py (89%)
 rename easy_rec/python/layers/{ => keras}/din.py (67%)
 create mode 100644 easy_rec/python/layers/keras/fibinet.py
 create mode 100644 easy_rec/python/layers/keras/fm.py
 create mode 100644 easy_rec/python/layers/keras/mask_net.py
 rename easy_rec/python/layers/{ => keras}/numerical_embedding.py (64%)
 delete mode 100644 easy_rec/python/layers/mask_net.py
 delete mode 100644 easy_rec/python/protos/fibinet.proto
 create mode 100644 easy_rec/python/protos/keras_layer.proto
 delete mode 100644 easy_rec/python/protos/masknet.proto
 create mode 100644 examples/configs/dlrm_on_criteo_with_periodic.config

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 82d42508c..139e31fee 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -4,36 +4,53 @@
 
 import tensorflow as tf
 
-from easy_rec.python.layers import dnn
-from easy_rec.python.layers.common_layers import Concatenate
 from easy_rec.python.layers.common_layers import EnhancedInputLayer
-from easy_rec.python.layers.common_layers import SENet
-from easy_rec.python.layers.common_layers import highway
-from easy_rec.python.layers.fibinet import FiBiNetLayer
-from easy_rec.python.layers.fm import FMLayer
-from easy_rec.python.layers.mask_net import MaskNet
-from easy_rec.python.layers.numerical_embedding import AutoDisEmbedding
-from easy_rec.python.layers.numerical_embedding import PeriodicEmbedding
+from easy_rec.python.layers.keras import MLP
+from easy_rec.python.layers.utils import Parameter
 from easy_rec.python.protos import backbone_pb2
-from easy_rec.python.protos import layer_pb2
 from easy_rec.python.utils.dag import DAG
-from easy_rec.python.utils.tf_utils import add_op
-from easy_rec.python.utils.tf_utils import dot_op
+from easy_rec.python.utils.load_class import load_keras_layer
+from google.protobuf import struct_pb2
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
 
 
+def block_input(config, block_outputs):
+  inputs = []
+  for input_node in config.inputs:
+    input_name = input_node.name
+    if input_name in block_outputs:
+      input_feature = block_outputs[input_name]
+    else:
+      raise KeyError('input name `%s` does not exists' % input_name)
+    if input_node.HasField('input_fn'):
+      fn = eval(input_node.input_fn)
+      input_feature = fn(input_feature)
+    inputs.append(input_feature)
+
+  if config.merge_inputs_into_list:
+    output = inputs
+  else:
+    output = concat_inputs(inputs, config.input_concat_axis, config.name)
+
+  if config.HasField('extra_input_fn'):
+    fn = eval(config.extra_input_fn)
+    output = fn(output)
+  return output
+
+
 class Backbone(object):
+  """Configurable Backbone Network."""
 
-  def __init__(self, config, model, features, input_layer, l2_reg=None):
-    self._model = model
+  def __init__(self, config, features, input_layer, l2_reg=None):
     self._config = config
     self._features = features
     self._input_layer = input_layer
     self._l2_reg = l2_reg
     self._dag = DAG()
     self._name_to_blocks = {}
+    self.loss_dict = {}
     input_feature_groups = set()
     for block in config.blocks:
       self._dag.add_node(block.name)
@@ -43,6 +60,10 @@ def __init__(self, config, model, features, input_layer, l2_reg=None):
         if len(block.inputs) != 0:
           raise ValueError('no input allowed for input_layer: ' + block.name)
         input_name = block.name
+        if not input_layer.has_group(input_name):
+          raise KeyError(
+              'input_layer\'s name must be one of feature group, invalid: ' +
+              input_name)
         if input_name in input_feature_groups:
           raise ValueError('input `%s` already exists in other block' %
                            input_name)
@@ -72,7 +93,7 @@ def __init__(self, config, model, features, input_layer, l2_reg=None):
             logging.info('adding an input_layer block: ' + input_name)
             new_block = backbone_pb2.Block()
             new_block.name = input_name
-            new_block.input_layer.CopyFrom(layer_pb2.InputLayer())
+            new_block.input_layer.CopyFrom(backbone_pb2.InputLayer())
             self._name_to_blocks[input_name] = new_block
             self._dag.add_node(input_name)
             self._dag.add_edge(input_name, block.name)
@@ -84,30 +105,7 @@ def __init__(self, config, model, features, input_layer, l2_reg=None):
     num_groups = len(input_feature_groups)
     assert num_groups > 0, 'there must be at least one input layer'
 
-  def block_input(self, config, block_outputs, output_list=False):
-    inputs = []
-    for input_node in config.inputs:
-      input_name = input_node.name
-      if input_name in block_outputs:
-        input_feature = block_outputs[input_name]
-      else:
-        raise KeyError('input name `%s` does not exists' % input_name)
-      if input_node.HasField('input_fn'):
-        fn = eval(input_node.input_fn)
-        input_feature = fn(input_feature)
-      inputs.append(input_feature)
-
-    if output_list:
-      output = inputs
-    else:
-      output = concat_inputs(inputs, config.input_concat_axis, config.name)
-
-    if config.HasField('extra_input_fn'):
-      fn = eval(config.extra_input_fn)
-      output = fn(output)
-    return output
-
-  def __call__(self, is_training, *args, **kwargs):
+  def __call__(self, is_training, **kwargs):
     block_outputs = {}
     blocks = self._dag.topological_sort()
     logging.info('backbone topological order: ' + ','.join(blocks))
@@ -116,85 +114,20 @@ def __call__(self, is_training, *args, **kwargs):
       config = self._name_to_blocks[block]
       layer = config.WhichOneof('layer')
       if layer is None:  # identity layer
-        block_outputs[block] = self.block_input(config, block_outputs)
+        block_outputs[block] = block_input(config, block_outputs)
       elif layer == 'input_layer':
         conf = config.input_layer
         input_fn = EnhancedInputLayer(conf, self._input_layer, self._features)
         output = input_fn(block, is_training)
         block_outputs[block] = output
-      elif layer == 'periodic_embedding':
-        input_feature = self.block_input(config, block_outputs)
-        num_emb = PeriodicEmbedding(config.periodic_embedding, scope=block)
-        block_outputs[block] = num_emb(input_feature)
-      elif layer == 'auto_dis_embedding':
-        input_feature = self.block_input(config, block_outputs)
-        num_emb = AutoDisEmbedding(config.auto_dis_embedding, scope=block)
-        block_outputs[block] = num_emb(input_feature)
-      elif layer == 'highway':
-        input_feature = self.block_input(config, block_outputs)
-        conf = config.highway
-        highway_layer = highway(
-            input_feature,
-            conf.emb_size,
-            activation=conf.activation,
-            dropout=conf.dropout_rate,
-            scope=block)
-        block_outputs[block] = highway_layer(input_feature)
-      elif layer == 'mlp':
-        input_feature = self.block_input(config, block_outputs)
-        mlp = dnn.DNN(
-            config.mlp,
-            self._l2_reg,
-            name='%s_mlp' % block,
-            is_training=is_training,
-            last_layer_no_activation=config.mlp.last_layer_no_activation,
-            last_layer_no_batch_norm=config.mlp.last_layer_no_batch_norm)
-        block_outputs[block] = mlp(input_feature)
-      elif layer == 'sequence_encoder':
-        block_outputs[block] = self.sequence_encoder(config, is_training)
-      elif layer == 'masknet':
-        input_feature = self.block_input(config, block_outputs)
-        mask_net = MaskNet(config.masknet, name=block, reuse=tf.AUTO_REUSE)
-        output = mask_net(input_feature, is_training, l2_reg=self._l2_reg)
-        block_outputs[block] = output
-      elif layer == 'senet':
-        input_feature = self.block_input(config, block_outputs)
-        senet = SENet(config.senet, name=block)
-        output = senet(input_feature)
-        block_outputs[block] = output
-      elif layer == 'fibinet':
-        input_feature = self.block_input(config, block_outputs)
-        fibinet = FiBiNetLayer(config.fibinet, name=block)
-        output = fibinet(input_feature, is_training, l2_reg=self._l2_reg)
+      elif layer == 'sequential':
+        inputs = block_input(config, block_outputs)
+        layers = config.sequential.layers
+        output = self.call_sequential_layers(inputs, layers, block, is_training)
         block_outputs[block] = output
-      elif layer == 'fm':
-        input_feature = self.block_input(config, block_outputs)
-        fm = FMLayer(config.fm, name=block)
-        block_outputs[block] = fm(input_feature)
-      elif layer == 'concat':
-        input_feature = self.block_input(config, block_outputs)
-        concat = Concatenate(config.concat)
-        block_outputs[block] = concat(input_feature)
-      elif layer == 'reshape':
-        input_feature = self.block_input(config, block_outputs)
-        block_outputs[block] = tf.reshape(input_feature,
-                                          list(config.reshape.dims))
-      elif layer == 'add':
-        input_feature = self.block_input(
-            config, block_outputs, output_list=True)
-        block_outputs[block] = add_op(input_feature)
-      elif layer == 'dot':
-        input_feature = self.block_input(config, block_outputs)
-        block_outputs[block] = dot_op(input_feature)
-      elif layer == 'Lambda':
-        input_feature = self.block_input(config, block_outputs)
-        fn = eval(config.Lambda.expression)
-        block_outputs[block] = fn(input_feature)
-      # elif layer == 'chain':
-      #   input_feature = self.block_input(config, block_outputs)
-      #   block_outputs[block] = op_chain(input_feature, config.chain.ops)
       else:
-        raise NotImplementedError('Unsupported backbone layer:' + layer)
+        inputs = block_input(config, block_outputs)
+        block_outputs[block] = self.call_layer(inputs, config, block, is_training)
 
     temp = []
     for output in self._config.concat_blocks:
@@ -205,33 +138,52 @@ def __call__(self, is_training, *args, **kwargs):
     output = concat_inputs(temp, msg='backbone')
 
     if self._config.HasField('top_mlp'):
-      no_act = self._config.top_mlp.last_layer_no_activation
-      no_bn = self._config.top_mlp.last_layer_no_batch_norm
-      final_dnn = dnn.DNN(
-          self._config.top_mlp,
-          self._l2_reg,
-          name='backbone_top_mlp',
-          is_training=is_training,
-          last_layer_no_activation=no_act,
-          last_layer_no_batch_norm=no_bn)
-      output = final_dnn(output)
+      params = Parameter.make_from_pb(self._config.top_mlp)
+      params.l2_regularizer = self._l2_reg
+      final_mlp = MLP(params, name='backbone_top_mlp')
+      output = final_mlp(output, training=is_training)
     return output
 
-  def sequence_encoder(self, config, is_training):
-    encodings = []
-    for seq_input in config.inputs:
-      encoding = self._model.get_sequence_encoding(seq_input, is_training)
-      encodings.append(encoding)
-    encoding = concat_inputs(encodings)
-    conf = config.sequence_encoder
-    if conf.HasField('mlp'):
-      sequence_dnn = dnn.DNN(
-          conf.mlp,
-          self._l2_reg,
-          name='%s_seq_dnn' % config.name,
-          is_training=is_training)
-      encoding = sequence_dnn(encoding)
-    return encoding
+  def call_keras_layer(self, layer_conf, inputs, name, training):
+    layer_cls, customize = load_keras_layer(layer_conf.class_name)
+    if layer_cls is None:
+      raise ValueError('Invalid keras layer class name: ' +
+                       layer_conf.class_name)
+
+    param_type = layer_conf.WhichOneof('params')
+    if customize:
+      if param_type is None or param_type == 'st_params':
+        params = Parameter(layer_conf.st_params, True, l2_reg=self._l2_reg)
+      else:
+        pb_params = getattr(layer_conf, param_type)
+        params = Parameter(pb_params, False, l2_reg=self._l2_reg)
+      layer = layer_cls(params, name=name)
+      kwargs = {'loss_dict': self.loss_dict}
+      return layer(inputs, training=training, **kwargs)
+    else:  # internal keras layer
+      if param_type is None:
+        layer = layer_cls(name=name)
+      else:
+        assert param_type == 'st_params', 'internal keras layer only support st_params'
+        kwargs = convert_to_dict(layer_conf.st_params)
+        layer = layer_cls(name=name, **kwargs)
+      return layer(inputs, training=training)
+
+  def call_sequential_layers(self, inputs, layers, name, training):
+   output = inputs
+   for layer in layers:
+     output = self.call_layer(output, layer, name, training)
+   return output
+
+  def call_layer(self, inputs, config, name, training):
+    layer_name = config.WhichOneof('layer')
+    if layer_name == 'keras_layer':
+      return self.call_keras_layer(config.keras_layer, inputs, name, training)
+    if layer_name == 'lambda':
+      conf = getattr(config, 'lambda')
+      fn = eval(conf.expression)
+      return fn(inputs)
+    raise NotImplementedError('Unsupported backbone layer:' + layer_name)
 
 
 def concat_inputs(inputs, axis=-1, msg=''):
@@ -250,66 +202,23 @@ def concat_inputs(inputs, axis=-1, msg=''):
   raise ValueError('no inputs to be concat:' + msg)
 
 
-# def op_chain(inputs, ops):
-#  output = inputs
-#  for op in ops:
-#    op_name = op.WhichOneOf('Op')
-#    output = run_op(output, op_name, op, block='op_chain')
-#  return output
-#
-#
-# def run_op(inputs, op_name, config, block='', is_training=False, l2_reg=None):
-#  if op_name == 'periodic_embedding':
-#    num_emb = PeriodicEmbedding(config.periodic_embedding, scope=block)
-#    return num_emb(inputs)
-#  elif op_name == 'auto_dis_embedding':
-#    num_emb = AutoDisEmbedding(config.auto_dis_embedding, scope=block)
-#    return num_emb(inputs)
-#  elif op_name == 'highway':
-#    conf = config.highway
-#    highway_op_name = highway(
-#      inputs,
-#      conf.emb_size,
-#      activation=conf.activation,
-#      dropout=conf.dropout_rate,
-#      scope=block)
-#    return highway_op_name(inputs)
-#  elif op_name == 'mlp':
-#    mlp = dnn.DNN(
-#      config.mlp,
-#      l2_reg,
-#      name='%s_mlp' % block,
-#      is_training=is_training,
-#      last_layer_no_activation=config.mlp.last_layer_no_activation,
-#      last_layer_no_batch_norm=config.mlp.last_layer_no_batch_norm)
-#    return mlp(inputs)
-#  elif op_name == 'masknet':
-#    mask_net = MaskNet(config.masknet, name=block, reuse=tf.AUTO_REUSE)
-#    output = mask_net(inputs, is_training, l2_reg=l2_reg)
-#    return output
-#  elif op_name == 'senet':
-#    senet = SENet(config.senet, name=block)
-#    output = senet(inputs)
-#    return output
-#  elif op_name == 'fibinet':
-#    fibinet = FiBiNetLayer(config.fibinet, name=block)
-#    output = fibinet(inputs, is_training, l2_reg=l2_reg)
-#    return output
-#  elif op_name == 'fm':
-#    fm = FMLayer(config.fm, name=block)
-#    return fm(inputs)
-#  if op_name == 'Lambda':
-#    fn = eval(config.Lambda.expression)
-#    output = fn(inputs)
-#  elif op_name == 'concat':
-#    concat = Concatenate(config.concat)
-#    output = concat(inputs)
-#  elif op_name == 'reshape':
-#    output = tf.reshape(inputs, list(config.reshape.dims))
-#  elif op_name == 'add':
-#    output = add_op(inputs)
-#  elif op_name == 'dot':
-#    output = dot_op(inputs)
-#  else:
-#    raise NotImplementedError('Unsupported op:' + op_name)
-#  return output
+def format_value(value):
+  value_type = type(value)
+  if value_type in (unicode, str):
+    return str(value)
+  if value_type == float:
+    int_v = int(value)
+    return int_v if int_v == value else value
+  if value_type == struct_pb2.ListValue:
+    return map(format_value, value)
+  if value_type == struct_pb2.Struct:
+    return convert_to_dict(value)
+  return value
+
+
+def convert_to_dict(struct):
+  kwargs = {}
+  for key, value in struct.items():
+    kwargs[str(key)] = format_value(value)
+  return kwargs
+
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index f06723f68..810654cf3 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -1,7 +1,5 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import itertools
-import logging
 
 import six
 import tensorflow as tf
@@ -94,6 +92,11 @@ def __init__(self, config, input_layer, feature_dict):
     self._feature_dict = feature_dict
 
   def __call__(self, group, is_training, *args, **kwargs):
+    if self._config.output_seq_and_normal_feature:
+      seq_features, target_feature, target_features = self._input_layer(
+        self._feature_dict, group, is_combine=False)
+      return seq_features, target_features
+
     features, feature_list = self._input_layer(self._feature_dict, group)
     num_features = len(feature_list)
 
@@ -155,172 +158,3 @@ def __call__(self, inputs, *args, **kwargs):
       dim = self.config.expand_dim_after
       output = tf.expand_dims(output, dim)
     return output
-
-
-class SENet(object):
-  """SENet+ Layer used in FiBiNET，支持不同field的embedding dimension不等.
-
-  arxiv: 2209.05016
-  """
-
-  def __init__(self, config, name='SENet'):
-    self.config = config
-    self.name = name
-
-  def __call__(self, embedding_list):
-    """embedding_list:  - A list of 2D tensor with shape: ``(batch_size,embedding_size)``."""
-    print('SENET layer with %d inputs' % len(embedding_list))
-    g = self.config.num_squeeze_group
-    for emb in embedding_list:
-      assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
-      dim = int(emb.shape[-1])
-      assert dim >= g and dim % g == 0, 'field embedding dimension %d must be divisible by %d' % (
-          dim, g)
-
-    field_size = len(embedding_list)
-    feature_size_list = [emb.shape.as_list()[-1] for emb in embedding_list]
-
-    # Squeeze
-    # embedding dimension 必须能被 g 整除
-    group_embs = [
-        tf.reshape(emb, [-1, g, int(emb.shape[-1]) // g])
-        for emb in embedding_list
-    ]
-
-    squeezed = []
-    for emb in group_embs:
-      squeezed.append(tf.reduce_max(emb, axis=-1))  # [B, g]
-      squeezed.append(tf.reduce_mean(emb, axis=-1))  # [B, g]
-    z = tf.concat(squeezed, axis=1)  # [bs, field_size * num_groups * 2]
-
-    # Excitation
-    r = self.config.reduction_ratio
-    reduction_size = max(1, field_size * g * 2 // r)
-
-    initializer = tf.glorot_normal_initializer()
-    a1 = tf.layers.dense(
-        z,
-        reduction_size,
-        kernel_initializer=initializer,
-        activation=tf.nn.relu,
-        name='%s/W1' % self.name)
-    weights = tf.layers.dense(
-        a1,
-        sum(feature_size_list),
-        kernel_initializer=initializer,
-        name='%s/W2' % self.name)
-
-    # Re-weight
-    inputs = tf.concat(embedding_list, axis=-1)
-    output = inputs * weights
-
-    # Fuse, add skip-connection
-    if self.config.use_skip_connection:
-      output += inputs
-
-    # Layer Normalization
-    if self.config.use_output_layer_norm:
-      output = layer_norm(output)
-    return output
-
-
-def _full_interaction(v_i, v_j):
-  # [bs, 1, dim] x [bs, dim, 1] = [bs, 1]
-  interaction = tf.matmul(
-      tf.expand_dims(v_i, axis=1), tf.expand_dims(v_j, axis=-1))
-  return tf.squeeze(interaction, axis=1)
-
-
-class BiLinear(object):
-
-  def __init__(self,
-               output_size,
-               bilinear_type,
-               bilinear_plus=True,
-               name='bilinear'):
-    """双线性特征交互层，支持不同field embeddings的size不等.
-
-    arxiv: 2209.05016
-    :param output_size: 输出的size
-    :param bilinear_type: ['all', 'each', 'interaction']，支持其中一种
-    :param bilinear_plus: 是否使用bi-linear+
-    """
-    self.name = name
-    self.bilinear_type = bilinear_type.lower()
-    self.output_size = output_size
-
-    if bilinear_type not in ['all', 'each', 'interaction']:
-      raise NotImplementedError(
-          "bilinear_type only support: ['all', 'each', 'interaction']")
-
-    if bilinear_plus:
-      self.func = _full_interaction
-    else:
-      self.func = tf.multiply
-
-  def __call__(self, embeddings):
-    print('Bilinear Layer with %d inputs' % len(embeddings))
-    if len(embeddings) > 200:
-      logging.warn('There are too many inputs for bilinear layer: %d' %
-                   len(embeddings))
-    equal_dim = True
-    _dim = embeddings[0].shape[-1]
-    for emb in embeddings:
-      assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
-      if emb.shape[-1] != _dim:
-        equal_dim = False
-    if not equal_dim and self.bilinear_type != 'interaction':
-      raise ValueError(
-          'all embedding dimensions must be same when not use bilinear type: interaction'
-      )
-    dim = int(_dim)
-
-    field_size = len(embeddings)
-    initializer = tf.glorot_normal_initializer()
-
-    # bi-linear+: p的维度为[bs, f*(f-1)/2]
-    # bi-linear:
-    # 当equal_dim=True时，p的维度为[bs, f*(f-1)/2*k]，k为embeddings的size
-    # 当equal_dim=False时，p的维度为[bs, (k_2+k_3+...+k_f)+...+(k_i+k_{i+1}+...+k_f)+...+k_f]，
-    # 其中 k_i为第i个field的embedding的size
-    if self.bilinear_type == 'all':
-      v_dot = [
-          tf.layers.dense(
-              v_i,
-              dim,
-              kernel_initializer=initializer,
-              name='%s/all' % self.name,
-              reuse=tf.AUTO_REUSE) for v_i in embeddings[:-1]
-      ]
-      p = [
-          self.func(v_dot[i], embeddings[j])
-          for i, j in itertools.combinations(range(field_size), 2)
-      ]
-    elif self.bilinear_type == 'each':
-      v_dot = [
-          tf.layers.dense(
-              v_i,
-              dim,
-              kernel_initializer=initializer,
-              name='%s/each_%d' % (self.name, i),
-              reuse=tf.AUTO_REUSE) for i, v_i in enumerate(embeddings[:-1])
-      ]
-      p = [
-          self.func(v_dot[i], embeddings[j])
-          for i, j in itertools.combinations(range(field_size), 2)
-      ]
-    else:  # interaction
-      p = [
-          self.func(
-              tf.layers.dense(
-                  embeddings[i],
-                  embeddings[j].shape.as_list()[-1],
-                  kernel_initializer=initializer,
-                  name='%s/interaction_%d_%d' % (self.name, i, j),
-                  reuse=tf.AUTO_REUSE), embeddings[j])
-          for i, j in itertools.combinations(range(field_size), 2)
-      ]
-
-    output = tf.layers.dense(
-        tf.concat(p, axis=-1), self.output_size, kernel_initializer=initializer)
-    return output
diff --git a/easy_rec/python/layers/fibinet.py b/easy_rec/python/layers/fibinet.py
deleted file mode 100644
index 77b6da4a5..000000000
--- a/easy_rec/python/layers/fibinet.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# -*- encoding:utf-8 -*-
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import tensorflow as tf
-
-from easy_rec.python.layers import dnn
-from easy_rec.python.layers.common_layers import BiLinear
-from easy_rec.python.layers.common_layers import SENet
-
-if tf.__version__ >= '2.0':
-  tf = tf.compat.v1
-
-
-class FiBiNetLayer(object):
-  """FiBiNet++:Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction.
-
-  This is almost an exact implementation of the original FiBiNet++ model.
-  See the original paper:
-  https://arxiv.org/pdf/2209.05016.pdf
-  """
-
-  def __init__(self, fibinet_config, name='fibinet'):
-    self._config = fibinet_config
-    self.name = name
-
-  def __call__(self, inputs, is_training, l2_reg=None, *args, **kwargs):
-    feature_list = []
-
-    senet = SENet(self._config.senet, name='%s_senet' % self.name)
-    senet_output = senet(inputs)
-    feature_list.append(senet_output)
-
-    if self._config.HasField('bilinear'):
-      conf = self._config.bilinear
-      bilinear = BiLinear(
-          output_size=conf.num_output_units,
-          bilinear_type=conf.type,
-          bilinear_plus=conf.use_plus,
-          name='%s_bilinear' % self.name)
-      bilinear_output = bilinear(inputs)
-      feature_list.append(bilinear_output)
-
-    if len(feature_list) > 1:
-      feature = tf.concat(feature_list, axis=-1)
-    else:
-      feature = feature_list[0]
-
-    if self._config.HasField('mlp'):
-      final_dnn = dnn.DNN(
-          self._config.mlp,
-          l2_reg,
-          name='%s_fibinet_mlp' % self.name,
-          is_training=is_training)
-      feature = final_dnn(feature)
-    return feature
diff --git a/easy_rec/python/layers/fm.py b/easy_rec/python/layers/fm.py
index 7b0742f6d..1929e00aa 100644
--- a/easy_rec/python/layers/fm.py
+++ b/easy_rec/python/layers/fm.py
@@ -24,46 +24,3 @@ def __call__(self, fm_fea):
       square_sum = tf.reduce_sum(tf.square(fm_feas), 1)
       y_v = 0.5 * tf.subtract(sum_square, square_sum)
     return y_v
-
-
-class FMLayer(object):
-  """Factorization Machine models pairwise (order-2) feature interactions without linear term and bias.
-
-  References
-    - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
-  """
-
-  def __init__(self, config, name='fm'):
-    self.name = name
-    self.config = config
-
-  def __call__(self, inputs):
-    """FM layer.
-
-    Input shape.
-      - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
-      - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
-    Output shape
-      - 2D tensor with shape: ``(batch_size, 1)``.
-    """
-    if type(inputs) == list:
-      emb_dims = set(map(lambda x: int(x.shape[-1]), inputs))
-      if len(emb_dims) != 1:
-        dims = ','.join([str(d) for d in emb_dims])
-        raise ValueError('all embedding dim must be equal in FM layer:' + dims)
-
-      with tf.name_scope(self.name):
-        fea = tf.stack(inputs, axis=1)
-    else:
-      assert inputs.shape.ndims == 3, 'input of FM layer must be a 3D tensor or a list of 2D tensors'
-      fea = inputs
-
-    with tf.name_scope(self.name):
-      square_of_sum = tf.square(tf.reduce_sum(fea, axis=1))
-      sum_of_square = tf.reduce_sum(tf.square(fea), axis=1)
-      cross_term = tf.subtract(square_of_sum, sum_of_square)
-      if self.config.use_variant:
-        cross_term = 0.5 * cross_term
-      else:
-        cross_term = 0.5 * tf.reduce_sum(cross_term, axis=-1)
-    return cross_term
diff --git a/easy_rec/python/layers/keras/__init__.py b/easy_rec/python/layers/keras/__init__.py
index c4006b39c..d0dda33cf 100644
--- a/easy_rec/python/layers/keras/__init__.py
+++ b/easy_rec/python/layers/keras/__init__.py
@@ -1 +1,13 @@
+from .blocks import MLP, Highway
+from .bst import BST
+from .din import DIN
+from .dcn import Cross
 from .dot_interaction import DotInteraction
+from .fibinet import BiLinear
+from .fibinet import FiBiNet
+from .fibinet import SENet
+from .fm import FM
+from .mask_net import MaskBlock
+from .mask_net import MaskNet
+from .numerical_embedding import AutoDisEmbedding
+from .numerical_embedding import PeriodicEmbedding
diff --git a/easy_rec/python/layers/keras/blocks.py b/easy_rec/python/layers/keras/blocks.py
new file mode 100644
index 000000000..507723017
--- /dev/null
+++ b/easy_rec/python/layers/keras/blocks.py
@@ -0,0 +1,117 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+"""Convenience blocks for building models."""
+import logging
+from easy_rec.python.utils.activation import get_activation
+import tensorflow as tf
+
+
+class MLP(tf.keras.layers.Layer):
+  """Sequential multi-layer perceptron (MLP) block.
+
+  Attributes:
+    units: Sequential list of layer sizes.
+    use_bias: Whether to include a bias term.
+    activation: Type of activation to use on all except the last layer.
+    final_activation: Type of activation to use on last layer.
+    **kwargs: Extra args passed to the Keras Layer base class.
+  """
+
+  def __init__(self, params, name='mlp', **kwargs):
+    super(MLP, self).__init__(name=name, **kwargs)
+    params.check_required('hidden_units')
+    use_bn = params.get_or_default('use_bn', True)
+    use_final_bn = params.get_or_default('use_final_bn', True)
+    use_bias = params.get_or_default('use_bias', True)
+    dropout_rate = list(params.get_or_default('dropout_ratio', []))
+    activation = params.get_or_default('activation', 'relu')
+    initializer = params.get_or_default('initializer', 'he_uniform')
+    final_activation = params.get_or_default('final_activation', None)
+    use_bn_after_act = params.get_or_default('use_bn_after_activation', False)
+    units = list(params.hidden_units)
+    logging.info(
+        'MLP(%s) units: %s, dropout: %r, activate=%s, use_bn=%r, final_bn=%r,'
+        ' final_activate=%s, bias=%r, initializer=%s, bn_after_activation=%r'
+        % (name, units, dropout_rate, activation, use_bn, use_final_bn,
+           final_activation, use_bias, initializer, use_bn_after_act))
+
+    num_dropout = len(dropout_rate)
+    self._sub_layers = []
+    for i, num_units in enumerate(units[:-1]):
+      name = 'dnn_%d' % i
+      drop_rate = dropout_rate[i] if i < num_dropout else 0.0
+      self.add_rich_layer(num_units, use_bn, drop_rate, activation, initializer,
+                          use_bias, use_bn_after_act, name, params.l2_regularizer)
+
+    n = len(units) - 1
+    drop_rate = dropout_rate[n] if num_dropout > n else 0.0
+    name = 'dnn_%d' % n
+    self.add_rich_layer(units[-1], use_final_bn, drop_rate, final_activation,
+                        initializer, use_bias, use_bn_after_act, name, params.l2_regularizer)
+
+  def add_rich_layer(self,
+                     num_units,
+                     use_bn,
+                     dropout_rate,
+                     activation,
+                     initializer,
+                     use_bias=True,
+                     use_bn_after_activation=False,
+                     name='mlp',
+                     l2_reg=None):
+    act_fn = get_activation(activation)
+    if use_bn and not use_bn_after_activation:
+      dense = tf.keras.layers.Dense(
+          units=num_units,
+          use_bias=use_bias,
+          kernel_initializer=initializer,
+          kernel_regularizer=l2_reg,
+          name=name)
+      self._sub_layers.append(dense)
+      # bn = tf.keras.layers.BatchNormalization(name='%s/bn' % name)
+      # keras BN layer have a stale issue on some versions of tf
+      bn = lambda x, training: tf.layers.batch_normalization(x, training=training, name='%s/bn' % name)
+      self._sub_layers.append(bn)
+      act = tf.keras.layers.Activation(act_fn, name='%s/act' % name)
+      self._sub_layers.append(act)
+    else:
+      dense = tf.keras.layers.Dense(
+          num_units,
+          activation=act_fn,
+          use_bias=use_bias,
+          kernel_initializer=initializer,
+          kernel_regularizer=l2_reg,
+          name=name)
+      self._sub_layers.append(dense)
+      if use_bn and use_bn_after_activation:
+        bn = lambda x, training: tf.layers.batch_normalization(x, training=training, name='%s/bn' % name)
+        self._sub_layers.append(bn)
+
+    if 0.0 < dropout_rate < 1.0:
+      dropout = tf.keras.layers.Dropout(dropout_rate, name='%s/dropout' % name)
+      self._sub_layers.append(dropout)
+    elif dropout_rate >= 1.0:
+      raise ValueError('invalid dropout_ratio: %.3f' % dropout_rate)
+
+  def call(self, x, training=None, **kwargs):
+    """Performs the forward computation of the block."""
+    for layer in self._sub_layers:
+      x = layer(x, training=training)
+    return x
+
+
+class Highway(tf.keras.layers.Layer):
+  def __init__(self, params, name='highway', **kwargs):
+    super(Highway, self).__init__(name, **kwargs)
+    params.check_required('emb_size')
+    self.emb_size = params.emb_size
+    self.num_layers = params.get_or_default('num_layers', 1)
+    self.activation = params.get_or_default('activation', 'gelu')
+    self.dropout_rate = params.get_or_default('dropout_rate', 0.0)
+
+  def call(self, inputs, training=None, **kwargs):
+    from easy_rec.python.layers.common_layers import highway
+    return highway(inputs, self.emb_size,
+                   activation=self.activation,
+                   num_layers=self.num_layers,
+                   dropout=self.dropout_rate if training else 0.0)
diff --git a/easy_rec/python/layers/bst.py b/easy_rec/python/layers/keras/bst.py
similarity index 89%
rename from easy_rec/python/layers/bst.py
rename to easy_rec/python/layers/keras/bst.py
index 9f2f78030..9492fda07 100644
--- a/easy_rec/python/layers/bst.py
+++ b/easy_rec/python/layers/keras/bst.py
@@ -7,17 +7,15 @@
 from easy_rec.python.loss.nce_loss import nce_loss
 from easy_rec.python.utils.activation import get_activation
 from easy_rec.python.utils.shape_utils import get_shape_list
+from tensorflow.python.keras.layers import Layer
 
-# from tensorflow.python.keras.layers import Layer
 
+class BST(Layer):
 
-class BST(object):
-
-  def __init__(self, config, l2_reg, name='bst', **kwargs):
-    # super(BST, self).__init__(name=name, **kwargs)
-    self.name = name
+  def __init__(self, params, name='bst', l2_reg=None, **kwargs):
+    super(BST, self).__init__(name=name, **kwargs)
     self.l2_reg = l2_reg
-    self.config = config
+    self.config = params.get_pb_config()
 
   def encode(self, seq_input, max_position):
     seq_fea = multihead_cross_attention.embedding_postprocessor(
@@ -44,15 +42,16 @@ def encode(self, seq_input, max_position):
         hidden_dropout_prob=self.config.hidden_dropout_prob,
         attention_probs_dropout_prob=self.config.attention_probs_dropout_prob,
         initializer_range=self.config.initializer_range,
-        name=self.name + '/bst',
+        name=self.name + '/transformer',
         reuse=tf.AUTO_REUSE)
     # attention_fea shape: [batch_size, seq_length, hidden_size]
     out_fea = attention_fea[:, 0, :]  # target feature
     print('bst output shape:', out_fea.shape)
     return out_fea
 
-  def __call__(self, inputs, training=None, **kwargs):
-    seq_features, target_feature = inputs
+  def call(self, inputs, training=None, **kwargs):
+    seq_features, target_features = inputs
+    assert len(seq_features) > 0, '[%s] sequence feature is empty' % self.name
     if not training:
       self.config.hidden_dropout_prob = 0.0
       self.config.attention_probs_dropout_prob = 0.0
@@ -70,7 +69,7 @@ def __call__(self, inputs, training=None, **kwargs):
     with tf.control_dependencies([valid_len]):
       # seq_input: [batch_size, seq_len, embed_size]
       seq_input = tf.concat(seq_embeds, axis=-1)
-    if target_feature is not None:
+    if len(target_features) > 0:
       max_position += 1
 
     seq_embed_size = seq_input.shape.as_list()[-1]
@@ -97,7 +96,8 @@ def __call__(self, inputs, training=None, **kwargs):
       loss_dict['%s_contrastive_loss' % self.name] = loss
       # tf.summary.scalar('loss/%s_contrastive_loss' % self.name, loss)
 
-    if target_feature is not None:
+    if len(target_features) > 0:
+      target_feature = tf.concat(target_features, axis=-1)
       target_size = target_feature.shape.as_list()[-1]
       assert seq_embed_size == target_size, 'the embedding size of sequence and target item is not equal' \
                                             ' in feature group:' + self.name
diff --git a/easy_rec/python/layers/keras/dcn.py b/easy_rec/python/layers/keras/dcn.py
index 2f35bdc5d..5fe4d4c42 100644
--- a/easy_rec/python/layers/keras/dcn.py
+++ b/easy_rec/python/layers/keras/dcn.py
@@ -8,78 +8,84 @@
 class Cross(tf.keras.layers.Layer):
   """Cross Layer in Deep & Cross Network to learn explicit feature interactions.
 
-    A layer that creates explicit and bounded-degree feature interactions
-    efficiently. The `call` method accepts `inputs` as a tuple of size 2
-    tensors. The first input `x0` is the base layer that contains the original
-    features (usually the embedding layer); the second input `xi` is the output
-    of the previous `Cross` layer in the stack, i.e., the i-th `Cross`
-    layer. For the first `Cross` layer in the stack, x0 = xi.
-
-    The output is x_{i+1} = x0 .* (W * xi + bias + diag_scale * xi) + xi,
-    where .* designates elementwise multiplication, W could be a full-rank
-    matrix, or a low-rank matrix U*V to reduce the computational cost, and
-    diag_scale increases the diagonal of W to improve training stability (
-    especially for the low-rank case).
-
-    References:
-        1. [R. Wang et al.](https://arxiv.org/pdf/2008.13535.pdf)
-          See Eq. (1) for full-rank and Eq. (2) for low-rank version.
-        2. [R. Wang et al.](https://arxiv.org/pdf/1708.05123.pdf)
-
-    Example:
-
-        ```python
-        # after embedding layer in a functional model:
-        input = tf.keras.Input(shape=(None,), name='index', dtype=tf.int64)
-        x0 = tf.keras.layers.Embedding(input_dim=32, output_dim=6)
-        x1 = Cross()(x0, x0)
-        x2 = Cross()(x0, x1)
-        logits = tf.keras.layers.Dense(units=10)(x2)
-        model = tf.keras.Model(input, logits)
-        ```
-
-    Args:
-        projection_dim: project dimension to reduce the computational cost.
-          Default is `None` such that a full (`input_dim` by `input_dim`) matrix
-          W is used. If enabled, a low-rank matrix W = U*V will be used, where U
-          is of size `input_dim` by `projection_dim` and V is of size
-          `projection_dim` by `input_dim`. `projection_dim` need to be smaller
-          than `input_dim`/2 to improve the model efficiency. In practice, we've
-          observed that `projection_dim` = d/4 consistently preserved the
-          accuracy of a full-rank version.
-        diag_scale: a non-negative float used to increase the diagonal of the
-          kernel W by `diag_scale`, that is, W + diag_scale * I, where I is an
-          identity matrix.
-        use_bias: whether to add a bias term for this layer. If set to False,
-          no bias term will be used.
-        preactivation: Activation applied to output matrix of the layer, before
-          multiplication with the input. Can be used to control the scale of the
-          layer's outputs and improve stability.
-        kernel_initializer: Initializer to use on the kernel matrix.
-        bias_initializer: Initializer to use on the bias vector.
-        kernel_regularizer: Regularizer to use on the kernel matrix.
-        bias_regularizer: Regularizer to use on bias vector.
-
-    Input shape: A tuple of 2 (batch_size, `input_dim`) dimensional inputs.
-    Output shape: A single (batch_size, `input_dim`) dimensional output.
+  A layer that creates explicit and bounded-degree feature interactions
+  efficiently. The `call` method accepts `inputs` as a tuple of size 2
+  tensors. The first input `x0` is the base layer that contains the original
+  features (usually the embedding layer); the second input `xi` is the output
+  of the previous `Cross` layer in the stack, i.e., the i-th `Cross`
+  layer. For the first `Cross` layer in the stack, x0 = xi.
+
+  The output is x_{i+1} = x0 .* (W * xi + bias + diag_scale * xi) + xi,
+  where .* designates elementwise multiplication, W could be a full-rank
+  matrix, or a low-rank matrix U*V to reduce the computational cost, and
+  diag_scale increases the diagonal of W to improve training stability (
+  especially for the low-rank case).
+
+  References:
+      1. [R. Wang et al.](https://arxiv.org/pdf/2008.13535.pdf)
+        See Eq. (1) for full-rank and Eq. (2) for low-rank version.
+      2. [R. Wang et al.](https://arxiv.org/pdf/1708.05123.pdf)
+
+  Example:
+
+      ```python
+      # after embedding layer in a functional model:
+      input = tf.keras.Input(shape=(None,), name='index', dtype=tf.int64)
+      x0 = tf.keras.layers.Embedding(input_dim=32, output_dim=6)
+      x1 = Cross()(x0, x0)
+      x2 = Cross()(x0, x1)
+      logits = tf.keras.layers.Dense(units=10)(x2)
+      model = tf.keras.Model(input, logits)
+      ```
+
+  Args:
+      projection_dim: project dimension to reduce the computational cost.
+        Default is `None` such that a full (`input_dim` by `input_dim`) matrix
+        W is used. If enabled, a low-rank matrix W = U*V will be used, where U
+        is of size `input_dim` by `projection_dim` and V is of size
+        `projection_dim` by `input_dim`. `projection_dim` need to be smaller
+        than `input_dim`/2 to improve the model efficiency. In practice, we've
+        observed that `projection_dim` = d/4 consistently preserved the
+        accuracy of a full-rank version.
+      diag_scale: a non-negative float used to increase the diagonal of the
+        kernel W by `diag_scale`, that is, W + diag_scale * I, where I is an
+        identity matrix.
+      use_bias: whether to add a bias term for this layer. If set to False,
+        no bias term will be used.
+      preactivation: Activation applied to output matrix of the layer, before
+        multiplication with the input. Can be used to control the scale of the
+        layer's outputs and improve stability.
+      kernel_initializer: Initializer to use on the kernel matrix.
+      bias_initializer: Initializer to use on the bias vector.
+      kernel_regularizer: Regularizer to use on the kernel matrix.
+      bias_regularizer: Regularizer to use on bias vector.
+
+  Input shape: A tuple of 2 (batch_size, `input_dim`) dimensional inputs.
+  Output shape: A single (batch_size, `input_dim`) dimensional output.
   """
 
-  def __init__(self, config, **kwargs):
+  def __init__(self, params, **kwargs):
     super(Cross, self).__init__(**kwargs)
-    self._projection_dim = config.projection_dim
-    self._diag_scale = config.diag_scale
-    self._use_bias = config.use_bias
-    self._preactivation = tf.keras.activations.get(config.preactivation)
-    self._kernel_initializer = tf.keras.initializers.get(config.kernel_initializer)
-    self._bias_initializer = tf.keras.initializers.get(config.bias_initializer)
-    self._kernel_regularizer = tf.keras.regularizers.get(config.kernel_regularizer)
-    self._bias_regularizer = tf.keras.regularizers.get(config.bias_regularizer)
+    self._projection_dim = params.get_or_default('projection_dim', None)
+    self._diag_scale = params.get_or_default('diag_scale', 0.0)
+    self._use_bias = params.get_or_default('use_bias', True)
+    preactivation = params.get_or_default('preactivation', None)
+    self._preactivation = tf.keras.activations.get(preactivation)
+    kernel_initializer = params.get_or_default('kernel_initializer',
+                                               'truncated_normal')
+    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    bias_initializer = params.get_or_default('bias_initializer', 'zeros')
+    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
+    kernel_regularizer = params.get_or_default('kernel_regularizer', None)
+    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
+    bias_regularizer = params.get_or_default('bias_regularizer', None)
+    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
     self._input_dim = None
     self._supports_masking = True
 
     if self._diag_scale < 0:  # pytype: disable=unsupported-operands
       raise ValueError(
-          "`diag_scale` should be non-negative. Got `diag_scale` = {}".format(
+          '`diag_scale` should be non-negative. Got `diag_scale` = {}'.format(
               self._diag_scale))
 
   def build(self, input_shape):
@@ -139,8 +145,8 @@ def call(self, inputs, **kwargs):
 
     if x0.shape[-1] != x.shape[-1]:
       raise ValueError(
-          "`x0` and `x` dimension mismatch! Got `x0` dimension {}, and x "
-          "dimension {}. This case is not supported yet.".format(
+          '`x0` and `x` dimension mismatch! Got `x0` dimension {}, and x '
+          'dimension {}. This case is not supported yet.'.format(
               x0.shape[-1], x.shape[-1]))
 
     if self._projection_dim is None:
@@ -157,21 +163,21 @@ def call(self, inputs, **kwargs):
 
   def get_config(self):
     config = {
-        "projection_dim":
+        'projection_dim':
             self._projection_dim,
-        "diag_scale":
+        'diag_scale':
             self._diag_scale,
-        "use_bias":
+        'use_bias':
             self._use_bias,
-        "preactivation":
+        'preactivation':
             tf.keras.activations.serialize(self._preactivation),
-        "kernel_initializer":
+        'kernel_initializer':
             tf.keras.initializers.serialize(self._kernel_initializer),
-        "bias_initializer":
+        'bias_initializer':
             tf.keras.initializers.serialize(self._bias_initializer),
-        "kernel_regularizer":
+        'kernel_regularizer':
             tf.keras.regularizers.serialize(self._kernel_regularizer),
-        "bias_regularizer":
+        'bias_regularizer':
             tf.keras.regularizers.serialize(self._bias_regularizer),
     }
     base_config = super(Cross, self).get_config()
diff --git a/easy_rec/python/layers/din.py b/easy_rec/python/layers/keras/din.py
similarity index 67%
rename from easy_rec/python/layers/din.py
rename to easy_rec/python/layers/keras/din.py
index 18505bd44..686d23e00 100644
--- a/easy_rec/python/layers/din.py
+++ b/easy_rec/python/layers/keras/din.py
@@ -7,32 +7,33 @@
 from easy_rec.python.layers import dnn
 from easy_rec.python.utils.shape_utils import get_shape_list
 
-# from tensorflow.python.keras.layers import Layer
+from tensorflow.python.keras.layers import Layer
 
 
-class DIN(object):
+class DIN(Layer):
 
-  def __init__(self, config, l2_reg, name='din', **kwargs):
-    # super(DIN, self).__init__(name=name, **kwargs)
-    self.name = name
+  def __init__(self, params, name='din', l2_reg=None, **kwargs):
+    super(DIN, self).__init__(name=name, **kwargs)
     self.l2_reg = l2_reg
-    self.config = config
+    self.config = params.get_pb_config()
 
-  def __call__(self, inputs, training=None, **kwargs):
-    seq_features, target_feature = inputs
+  def call(self, inputs, training=None, **kwargs):
+    seq_features, target_features = inputs
+    assert len(seq_features) > 0, '[%s] sequence feature is empty' % self.name
+    assert len(target_features) > 0, '[%s] target feature is empty' % self.name
+
+    query = tf.concat(target_features, axis=-1)
     seq_input = [seq_fea for seq_fea, _ in seq_features]
     keys = tf.concat(seq_input, axis=-1)
 
-    query = target_feature
-    target_emb_size = target_feature.shape.as_list()[-1]
+    query_emb_size = int(query.shape[-1])
     seq_emb_size = keys.shape.as_list()[-1]
-    if target_emb_size != seq_emb_size:
+    if query_emb_size != seq_emb_size:
       logging.info(
           '<din> the embedding size of sequence [%d] and target item [%d] is not equal'
-          ' in feature group: %s', seq_emb_size, target_emb_size, self.name)
-      if target_emb_size < seq_emb_size:
-        query = tf.pad(target_feature,
-                       [[0, 0], [0, seq_emb_size - target_emb_size]])
+          ' in feature group: %s', seq_emb_size, query_emb_size, self.name)
+      if query_emb_size < seq_emb_size:
+        query = tf.pad(query, [[0, 0], [0, seq_emb_size - query_emb_size]])
       else:
         assert False, 'the embedding size of target item is larger than the one of sequence'
 
@@ -64,10 +65,10 @@ def __call__(self, inputs, training=None, **kwargs):
       raise ValueError('unsupported attention normalizer: ' +
                        self.config.attention_normalizer)
 
-    if target_emb_size < seq_emb_size:
-      keys = keys[:, :, :target_emb_size]  # [B, L, E]
+    if query_emb_size < seq_emb_size:
+      keys = keys[:, :, :query_emb_size]  # [B, L, E]
     output = tf.squeeze(tf.matmul(scores, keys), axis=[1])
     if self.config.need_target_feature:
-      output = tf.concat([output, target_feature], axis=-1)
+      output = tf.concat([output, query], axis=-1)
     print('din output shape:', output.shape)
     return output
diff --git a/easy_rec/python/layers/keras/dot_interaction.py b/easy_rec/python/layers/keras/dot_interaction.py
index 50a3966af..7ec47c5ad 100644
--- a/easy_rec/python/layers/keras/dot_interaction.py
+++ b/easy_rec/python/layers/keras/dot_interaction.py
@@ -27,14 +27,9 @@ class DotInteraction(tf.keras.layers.Layer):
     name: String name of the layer.
   """
 
-  def __init__(self,
-               config,
-               self_interaction=False,
-               skip_gather=False,
-               name=None,
-               **kwargs):
-    self._self_interaction = config.self_interaction
-    self._skip_gather = config.skip_gather
+  def __init__(self, params, name=None, **kwargs):
+    self._self_interaction = params.get_or_default('self_interaction', False)
+    self._skip_gather = params.get_or_default('skip_gather', False)
     super(DotInteraction, self).__init__(name=name, **kwargs)
 
   def call(self, inputs, **kwargs):
@@ -53,20 +48,22 @@ def call(self, inputs, **kwargs):
       `num_features * (num_features + 1) / 2` if self_interaction is True and
       `num_features * (num_features - 1) / 2` if self_interaction is False.
     """
-    num_features = len(inputs)
-    batch_size = tf.shape(inputs[0])[0]
-    feature_dim = tf.shape(inputs[0])[1]
-    # concat_features shape: batch_size, num_features, feature_dim
-    try:
-      concat_features = tf.concat(inputs, axis=-1)
-      concat_features = tf.reshape(concat_features,
-                                   [batch_size, -1, feature_dim])
-    except (ValueError, tf.errors.InvalidArgumentError) as e:
-      raise ValueError('Input tensors` dimensions must be equal, original'
-                       'error message: {}'.format(e))
+    if isinstance(inputs, (list, tuple)):
+      # concat_features shape: batch_size, num_features, feature_dim
+      try:
+        concat_features = tf.stack(inputs, axis=1)
+      except (ValueError, tf.errors.InvalidArgumentError) as e:
+        raise ValueError('Input tensors` dimensions must be equal, original'
+                         'error message: {}'.format(e))
+    else:
+      assert inputs.shape.ndims == 3, 'input of dot func must be a 3D tensor or a list of 2D tensors'
+      concat_features = inputs
+
+    batch_size = tf.shape(concat_features)[0]
 
     # Interact features, select lower-triangular portion, and re-shape.
     xactions = tf.matmul(concat_features, concat_features, transpose_b=True)
+    num_features = xactions.shape[-1]
     ones = tf.ones_like(xactions)
     if self._self_interaction:
       # Selecting lower-triangular portion including the diagonal.
diff --git a/easy_rec/python/layers/keras/fibinet.py b/easy_rec/python/layers/keras/fibinet.py
new file mode 100644
index 000000000..dc1f7d003
--- /dev/null
+++ b/easy_rec/python/layers/keras/fibinet.py
@@ -0,0 +1,229 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import itertools
+import logging
+
+import tensorflow as tf
+
+from easy_rec.python.layers import dnn
+from easy_rec.python.layers.common_layers import layer_norm
+from easy_rec.python.layers.keras.blocks import MLP
+from easy_rec.python.layers.utils import Parameter
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class SENet(tf.keras.layers.Layer):
+  """SENet+ Layer used in FiBiNET，支持不同field的embedding dimension不等.
+
+  arxiv: 2209.05016
+  """
+
+  def __init__(self, params, name='SENet', **kwargs):
+    super(SENet, self).__init__(name, **kwargs)
+    self.config = params.get_pb_config()
+
+  def call(self, inputs, **kwargs):
+    """embedding_list:  - A list of 2D tensor with shape: ``(batch_size,embedding_size)``."""
+    print('SENET layer with %d inputs' % len(inputs))
+    g = self.config.num_squeeze_group
+    for emb in inputs:
+      assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
+      dim = int(emb.shape[-1])
+      assert dim >= g and dim % g == 0, 'field embedding dimension %d must be divisible by %d' % (
+          dim, g)
+
+    field_size = len(inputs)
+    feature_size_list = [emb.shape.as_list()[-1] for emb in inputs]
+
+    # Squeeze
+    # embedding dimension 必须能被 g 整除
+    group_embs = [
+        tf.reshape(emb, [-1, g, int(emb.shape[-1]) // g]) for emb in inputs
+    ]
+
+    squeezed = []
+    for emb in group_embs:
+      squeezed.append(tf.reduce_max(emb, axis=-1))  # [B, g]
+      squeezed.append(tf.reduce_mean(emb, axis=-1))  # [B, g]
+    z = tf.concat(squeezed, axis=1)  # [bs, field_size * num_groups * 2]
+
+    # Excitation
+    r = self.config.reduction_ratio
+    reduction_size = max(1, field_size * g * 2 // r)
+
+    initializer = tf.glorot_normal_initializer()
+    a1 = tf.layers.dense(
+        z,
+        reduction_size,
+        kernel_initializer=initializer,
+        activation=tf.nn.relu,
+        name='%s/W1' % self.name)
+    weights = tf.layers.dense(
+        a1,
+        sum(feature_size_list),
+        kernel_initializer=initializer,
+        name='%s/W2' % self.name)
+
+    # Re-weight
+    inputs = tf.concat(inputs, axis=-1)
+    output = inputs * weights
+
+    # Fuse, add skip-connection
+    if self.config.use_skip_connection:
+      output += inputs
+
+    # Layer Normalization
+    if self.config.use_output_layer_norm:
+      output = layer_norm(output)
+    return output
+
+
+def _full_interaction(v_i, v_j):
+  # [bs, 1, dim] x [bs, dim, 1] = [bs, 1]
+  interaction = tf.matmul(
+      tf.expand_dims(v_i, axis=1), tf.expand_dims(v_j, axis=-1))
+  return tf.squeeze(interaction, axis=1)
+
+
+class BiLinear(tf.keras.layers.Layer):
+  """双线性特征交互层，支持不同field embeddings的size不等.
+
+  arxiv: 2209.05016
+
+  Attributes:
+    num_output_units: 输出的size
+    type: ['all', 'each', 'interaction']，支持其中一种
+    use_plus: 是否使用bi-linear+
+  """
+
+  def __init__(self, params, name='bilinear', **kwargs):
+    super(BiLinear, self).__init__(name, **kwargs)
+    params.check_required(['num_output_units'])
+    bilinear_plus = params.get_or_default('use_plus', True)
+    self.bilinear_type = params.get_or_default('type', 'interaction').lower()
+    self.output_size = params.num_output_units
+
+    if self.bilinear_type not in ['all', 'each', 'interaction']:
+      raise NotImplementedError(
+          "bilinear_type only support: ['all', 'each', 'interaction']")
+
+    if bilinear_plus:
+      self.func = _full_interaction
+    else:
+      self.func = tf.multiply
+
+  def call(self, inputs, **kwargs):
+    embeddings = inputs
+    logging.info('Bilinear Layer with %d inputs' % len(embeddings))
+    if len(embeddings) > 200:
+      logging.warning('There are too many inputs for bilinear layer: %d' %
+                      len(embeddings))
+    equal_dim = True
+    _dim = embeddings[0].shape[-1]
+    for emb in embeddings:
+      assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
+      if emb.shape[-1] != _dim:
+        equal_dim = False
+    if not equal_dim and self.bilinear_type != 'interaction':
+      raise ValueError(
+          'all embedding dimensions must be same when not use bilinear type: interaction'
+      )
+    dim = int(_dim)
+
+    field_size = len(embeddings)
+    initializer = tf.glorot_normal_initializer()
+
+    # bi-linear+: p的维度为[bs, f*(f-1)/2]
+    # bi-linear:
+    # 当equal_dim=True时，p的维度为[bs, f*(f-1)/2*k]，k为embeddings的size
+    # 当equal_dim=False时，p的维度为[bs, (k_2+k_3+...+k_f)+...+(k_i+k_{i+1}+...+k_f)+...+k_f]，
+    # 其中 k_i为第i个field的embedding的size
+    if self.bilinear_type == 'all':
+      v_dot = [
+          tf.layers.dense(
+              v_i,
+              dim,
+              kernel_initializer=initializer,
+              name='%s/all' % self.name,
+              reuse=tf.AUTO_REUSE) for v_i in embeddings[:-1]
+      ]
+      p = [
+          self.func(v_dot[i], embeddings[j])
+          for i, j in itertools.combinations(range(field_size), 2)
+      ]
+    elif self.bilinear_type == 'each':
+      v_dot = [
+          tf.layers.dense(
+              v_i,
+              dim,
+              kernel_initializer=initializer,
+              name='%s/each_%d' % (self.name, i),
+              reuse=tf.AUTO_REUSE) for i, v_i in enumerate(embeddings[:-1])
+      ]
+      p = [
+          self.func(v_dot[i], embeddings[j])
+          for i, j in itertools.combinations(range(field_size), 2)
+      ]
+    else:  # interaction
+      p = [
+          self.func(
+              tf.layers.dense(
+                  embeddings[i],
+                  embeddings[j].shape.as_list()[-1],
+                  kernel_initializer=initializer,
+                  name='%s/interaction_%d_%d' % (self.name, i, j),
+                  reuse=tf.AUTO_REUSE), embeddings[j])
+          for i, j in itertools.combinations(range(field_size), 2)
+      ]
+
+    output = tf.layers.dense(
+        tf.concat(p, axis=-1), self.output_size, kernel_initializer=initializer)
+    return output
+
+
+class FiBiNet(tf.keras.layers.Layer):
+  """FiBiNet++:Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction.
+
+  This is almost an exact implementation of the original FiBiNet++ model.
+  See the original paper:
+  https://arxiv.org/pdf/2209.05016.pdf
+  """
+
+  def __init__(self, params, name='fibinet', l2_reg=None, **kwargs):
+    super(FiBiNet, self).__init__(name, **kwargs)
+    self._config = params.get_pb_config()
+    if self._config.HasField('mlp'):
+      # self.final_dnn = dnn.DNN(
+      #   self._config.mlp,
+      #   kwargs['l2_reg'] if 'l2_reg' in kwargs else None,
+      #   name='%s_fibinet_mlp' % self.name,
+      #   is_training=False)
+      p = Parameter.make_from_pb(self._config.mlp)
+      self.final_dnn = MLP(p, name=name, l2_reg=l2_reg)
+    else:
+      self.final_dnn = None
+
+  def call(self, inputs, training=None, **kwargs):
+    feature_list = []
+
+    params = Parameter.make_from_pb(self._config.senet)
+    senet = SENet(params, name='%s_senet' % self.name)
+    senet_output = senet(inputs)
+    feature_list.append(senet_output)
+
+    if self._config.HasField('bilinear'):
+      params = Parameter.make_from_pb(self._config.bilinear)
+      bilinear = BiLinear(params, name='%s_bilinear' % self.name)
+      bilinear_output = bilinear(inputs)
+      feature_list.append(bilinear_output)
+
+    if len(feature_list) > 1:
+      feature = tf.concat(feature_list, axis=-1)
+    else:
+      feature = feature_list[0]
+
+    if self.final_dnn is not None:
+      feature = self.final_dnn(feature, training=training)
+    return feature
diff --git a/easy_rec/python/layers/keras/fm.py b/easy_rec/python/layers/keras/fm.py
new file mode 100644
index 000000000..56910541f
--- /dev/null
+++ b/easy_rec/python/layers/keras/fm.py
@@ -0,0 +1,46 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class FM(tf.keras.layers.Layer):
+  """Factorization Machine models pairwise (order-2) feature interactions without linear term and bias.
+
+  References
+    - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
+  Input shape.
+    - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
+    - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
+  Output shape
+    - 2D tensor with shape: ``(batch_size, 1)``.
+  """
+
+  def __init__(self, params, name='fm', **kwargs):
+    super(FM, self).__init__(name, **kwargs)
+    self.use_variant = params.get_or_default('use_variant', False)
+
+  def call(self, inputs, **kwargs):
+    if type(inputs) == list:
+      emb_dims = set(map(lambda x: int(x.shape[-1]), inputs))
+      if len(emb_dims) != 1:
+        dims = ','.join([str(d) for d in emb_dims])
+        raise ValueError('all embedding dim must be equal in FM layer:' + dims)
+
+      with tf.name_scope(self.name):
+        fea = tf.stack(inputs, axis=1)
+    else:
+      assert inputs.shape.ndims == 3, 'input of FM layer must be a 3D tensor or a list of 2D tensors'
+      fea = inputs
+
+    with tf.name_scope(self.name):
+      square_of_sum = tf.square(tf.reduce_sum(fea, axis=1))
+      sum_of_square = tf.reduce_sum(tf.square(fea), axis=1)
+      cross_term = tf.subtract(square_of_sum, sum_of_square)
+      if self.use_variant:
+        cross_term = 0.5 * cross_term
+      else:
+        cross_term = 0.5 * tf.reduce_sum(cross_term, axis=-1, keepdims=True)
+    return cross_term
diff --git a/easy_rec/python/layers/keras/mask_net.py b/easy_rec/python/layers/keras/mask_net.py
new file mode 100644
index 000000000..8749a1ee8
--- /dev/null
+++ b/easy_rec/python/layers/keras/mask_net.py
@@ -0,0 +1,102 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+
+from easy_rec.python.layers.common_layers import layer_norm
+from easy_rec.python.layers.keras.blocks import MLP
+from easy_rec.python.layers.utils import Parameter
+
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
+
+class MaskBlock(tf.keras.layers.Layer):
+
+  def __init__(self, params, name='mask_block', reuse=None, **kwargs):
+    super(MaskBlock, self).__init__(name, **kwargs)
+    self.config = params.get_pb_config()
+    self.reuse = reuse
+
+  def call(self, inputs, **kwargs):
+    net, mask_input = inputs
+    mask_input_dim = int(mask_input.shape[-1])
+    if self.config.HasField('reduction_factor'):
+      aggregation_size = int(mask_input_dim * self.config.reduction_factor)
+    elif self.config.HasField('aggregation_size') is not None:
+      aggregation_size = self.config.aggregation_size
+    else:
+      raise ValueError(
+          'Need one of reduction factor or aggregation size for MaskBlock.')
+
+    if self.config.input_layer_norm:
+      input_name = net.name.replace(':', '_')
+      net = layer_norm(net, reuse=tf.AUTO_REUSE, name='ln_' + input_name)
+
+    # initializer = tf.initializers.variance_scaling()
+    initializer = tf.glorot_uniform_initializer()
+    mask = tf.layers.dense(
+        mask_input,
+        aggregation_size,
+        activation=tf.nn.relu,
+        kernel_initializer=initializer,
+        name='%s/hidden' % self.name,
+        reuse=self.reuse)
+    mask = tf.layers.dense(
+        mask, net.shape[-1], name='%s/mask' % self.name, reuse=self.reuse)
+    masked_net = net * mask
+
+    output_size = self.config.output_size
+    hidden = tf.layers.dense(
+        masked_net,
+        output_size,
+        use_bias=False,
+        name='%s/output' % self.name,
+        reuse=self.reuse)
+    ln_hidden = layer_norm(
+        hidden, name='%s/ln_output' % self.name, reuse=self.reuse)
+    return tf.nn.relu(ln_hidden)
+
+
+class MaskNet(tf.keras.layers.Layer):
+  """MaskNet: Introducing Feature-Wise Multiplication to CTR Ranking Models by Instance-Guided Mask.
+
+  Refer: https://arxiv.org/pdf/2102.07619.pdf
+  """
+
+  def __init__(self, params, name='mask_net', l2_reg=None, **kwargs):
+    super(MaskNet, self).__init__(name, **kwargs)
+    self.config = params.get_pb_config()
+    if self.config.HasField('mlp'):
+      p = Parameter.make_from_pb(self.config.mlp)
+      self.mlp = MLP(p, name='%s/mlp' % name, l2_reg=l2_reg)
+    else:
+      self.mlp = None
+
+  def call(self, inputs, training=None, **kwargs):
+    if self.config.use_parallel:
+      mask_outputs = []
+      for i, block_conf in enumerate(self.config.mask_blocks):
+        params = Parameter.make_from_pb(block_conf)
+        mask_layer = MaskBlock(
+            params, name='%s/block_%d' % (self.name, i), reuse=self.reuse)
+        mask_outputs.append(mask_layer((inputs, inputs)))
+      all_mask_outputs = tf.concat(mask_outputs, axis=1)
+
+      if self.mlp is not None:
+        output = self.mlp(all_mask_outputs)
+      else:
+        output = all_mask_outputs
+      return output
+    else:
+      net = inputs
+      for i, block_conf in enumerate(self.config.mask_blocks):
+        params = Parameter.make_from_pb(block_conf)
+        mask_layer = MaskBlock(
+            params, name='%s/block_%d' % (self.name, i), reuse=self.reuse)
+        net = mask_layer((net, inputs))
+
+      if self.mlp is not None:
+        output = self.mlp(net)
+      else:
+        output = net
+      return output
diff --git a/easy_rec/python/layers/numerical_embedding.py b/easy_rec/python/layers/keras/numerical_embedding.py
similarity index 64%
rename from easy_rec/python/layers/numerical_embedding.py
rename to easy_rec/python/layers/keras/numerical_embedding.py
index 6b571a3ad..4d6a16ca5 100644
--- a/easy_rec/python/layers/numerical_embedding.py
+++ b/easy_rec/python/layers/keras/numerical_embedding.py
@@ -77,88 +77,95 @@ def __call__(self, x, *args, **kwargs):
     return x
 
 
-class PeriodicEmbedding(object):
+class PeriodicEmbedding(tf.keras.layers.Layer):
   """Periodic embeddings for numerical features described in [1].
 
   References:
     * [1] Yury Gorishniy, Ivan Rubachev, Artem Babenko,
     "On Embeddings for Numerical Features in Tabular Deep Learning", 2022
     https://arxiv.org/pdf/2203.05556.pdf
-  """
 
-  def __init__(self, config, scope='periodic_embedding'):
-    """Init with a pb config.
+  Attributes:
+    embedding_dim: the embedding size, must be an even positive integer.
+    sigma: the scale of the weight initialization.
+      **This is a super important parameter which significantly affects performance**.
+      Its optimal value can be dramatically different for different datasets, so
+      no "default value" can exist for this parameter, and it must be tuned for
+      each dataset. In the original paper, during hyperparameter tuning, this
+      parameter was sampled from the distribution ``LogUniform[1e-2, 1e2]``.
+      A similar grid would be ``[1e-2, 1e-1, 1e0, 1e1, 1e2]``.
+      If possible, add more intermediate values to this grid.
+    output_3d_tensor: whether to output a 3d tensor
+    output_tensor_list: whether to output the list of embedding
+  """
 
-    Args:
-      config: pb config
-      config.embedding_dim: the embedding size, must be an even positive integer.
-      config.sigma: the scale of the weight initialization.
-        **This is a super important parameter which significantly affects performance**.
-        Its optimal value can be dramatically different for different datasets, so
-        no "default value" can exist for this parameter, and it must be tuned for
-        each dataset. In the original paper, during hyperparameter tuning, this
-        parameter was sampled from the distribution ``LogUniform[1e-2, 1e2]``.
-        A similar grid would be ``[1e-2, 1e-1, 1e0, 1e1, 1e2]``.
-        If possible, add more intermidiate values to this grid.
-      config.output_3d_tensor: whether to output a 3d tensor
-      scope: variable scope name
-    """
-    self.config = config
-    if config.embedding_dim % 2:
+  def __init__(self, params, name='periodic_embedding', **kwargs):
+    super(PeriodicEmbedding, self).__init__(name, **kwargs)
+    params.check_required(['embedding_dim', 'sigma'])
+    self.embedding_dim = int(params.embedding_dim)
+    if self.embedding_dim % 2:
       raise ValueError('embedding_dim must be even')
-    self.emb_dim = config.embedding_dim // 2
-    self.scope = scope
-    self.initializer = tf.random_normal_initializer(stddev=config.sigma)
-
-  def __call__(self, inputs, *args, **kwargs):
+    sigma = params.sigma
+    self.initializer = tf.random_normal_initializer(stddev=sigma)
+    self.add_linear_layer = params.get_or_default('add_linear_layer', True)
+    self.linear_activation = params.get_or_default('linear_activation', 'relu')
+    self.output_tensor_list = params.get_or_default('output_tensor_list', False)
+    self.output_3d_tensor = params.get_or_default('output_3d_tensor', False)
+
+  def call(self, inputs, **kwargs):
     if inputs.shape.ndims != 2:
       raise ValueError('inputs of PeriodicEmbedding must have 2 dimensions.')
 
     num_features = int(inputs.shape[-1])
-    with tf.variable_scope(self.scope):
+    emb_dim = self.embedding_dim // 2
+    with tf.variable_scope(self.name):
       c = tf.get_variable(
           'coefficients',
-          shape=[1, num_features, self.emb_dim],
+          shape=[1, num_features, emb_dim],
           initializer=self.initializer)
 
       features = inputs[..., None]  # [B, N, 1]
       v = 2 * math.pi * c * features  # [B, N, E]
       emb = tf.concat([tf.sin(v), tf.cos(v)], axis=-1)  # [B, N, 2E]
 
-      dim = self.config.embedding_dim
-      if self.config.add_linear_layer:
+      dim = self.embedding_dim
+      if self.add_linear_layer:
         linear = NLinear(num_features, dim, dim)
         emb = linear(emb)
-        act = get_activation(self.config.linear_activation)
+        act = get_activation(self.linear_activation)
         if callable(act):
           emb = act(emb)
       output = tf.reshape(emb, [-1, num_features * dim])
 
-      if self.config.output_tensor_list:
+      if self.output_tensor_list:
         return output, tf.unstack(emb, axis=1)
-      if self.config.output_3d_tensor:
+      if self.output_3d_tensor:
         return output, emb
       return output
 
 
-class AutoDisEmbedding(object):
+class AutoDisEmbedding(tf.keras.layers.Layer):
   """An Embedding Learning Framework for Numerical Features in CTR Prediction.
 
   Refer: https://arxiv.org/pdf/2012.08986v2.pdf
   """
 
-  def __init__(self, config, scope='auto_dis'):
-    self.config = config
-    self.emb_dim = config.embedding_dim
-    self.num_bins = config.num_bins
-    self.scope = scope
-
-  def __call__(self, inputs, *args, **kwargs):
+  def __init__(self, params, name='auto_dis_embedding', **kwargs):
+    super(AutoDisEmbedding, self).__init__(name, **kwargs)
+    params.check_required(['embedding_dim', 'num_bins', 'temperature'])
+    self.emb_dim = int(params.embedding_dim)
+    self.num_bins = int(params.num_bins)
+    self.temperature = params.temperature
+    self.keep_prob = params.get_or_default('keep_prob', 0.8)
+    self.output_tensor_list = params.get_or_default('output_tensor_list', False)
+    self.output_3d_tensor = params.get_or_default('output_3d_tensor', False)
+
+  def call(self, inputs, **kwargs):
     if inputs.shape.ndims != 2:
-      raise ValueError('inputs of PeriodicEmbedding must have 2 dimensions.')
+      raise ValueError('inputs of AutoDisEmbedding must have 2 dimensions.')
 
     num_features = int(inputs.shape[-1])
-    with tf.variable_scope(self.scope):
+    with tf.variable_scope(self.name):
       meta_emb = tf.get_variable(
           'meta_embedding',
           shape=[1, num_features, self.num_bins, self.emb_dim])
@@ -173,18 +180,17 @@ def __call__(self, inputs, *args, **kwargs):
       y = tf.squeeze(y, axis=3)  # [B, N, num_bin]
 
       # keep_prob(float): if dropout_flag is True, keep_prob rate to keep connect
-      alpha = self.config.keep_prob
+      alpha = self.keep_prob
       x_bar = y + alpha * hidden  # [B, N, num_bin]
-      t = self.config.temperature
-      x_hat = tf.nn.softmax(x_bar / t)  # [B, N, num_bin]
+      x_hat = tf.nn.softmax(x_bar / self.temperature)  # [B, N, num_bin]
 
       emb = tf.matmul(x_hat[:, :, None, :], meta_emb)  # [B, N, 1, D]
       emb = tf.squeeze(emb, axis=2)  # [B, N, D]
       output = tf.reshape(emb, [-1, self.emb_dim * num_features])  # [B, N*D]
 
-      if self.config.output_tensor_list:
+      if self.output_tensor_list:
         return output, tf.unstack(emb, axis=1)
 
-      if self.config.output_3d_tensor:
+      if self.output_3d_tensor:
         return output, emb
       return output
diff --git a/easy_rec/python/layers/mask_net.py b/easy_rec/python/layers/mask_net.py
deleted file mode 100644
index 2ec3f5799..000000000
--- a/easy_rec/python/layers/mask_net.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# -*- encoding:utf-8 -*-
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import tensorflow as tf
-
-from easy_rec.python.layers import dnn
-from easy_rec.python.layers.common_layers import layer_norm
-
-if tf.__version__ >= '2.0':
-  tf = tf.compat.v1
-
-
-class MaskBlock(object):
-
-  def __init__(self, mask_block_config, name='mask_block', reuse=None):
-    self.mask_block_config = mask_block_config
-    self.name = name
-    self.reuse = reuse
-
-  def __call__(self, net, mask_input):
-    mask_input_dim = int(mask_input.shape[-1])
-    if self.mask_block_config.HasField('reduction_factor'):
-      aggregation_size = int(mask_input_dim *
-                             self.mask_block_config.reduction_factor)
-    elif self.mask_block_config.HasField('aggregation_size') is not None:
-      aggregation_size = self.mask_block_config.aggregation_size
-    else:
-      raise ValueError(
-          'Need one of reduction factor or aggregation size for MaskBlock.')
-
-    if self.mask_block_config.input_layer_norm:
-      input_name = net.name.replace(':', '_')
-      net = layer_norm(net, reuse=tf.AUTO_REUSE, name='ln_' + input_name)
-
-    # initializer = tf.initializers.variance_scaling()
-    initializer = tf.glorot_uniform_initializer()
-    mask = tf.layers.dense(
-        mask_input,
-        aggregation_size,
-        activation=tf.nn.relu,
-        kernel_initializer=initializer,
-        name='%s/hidden' % self.name,
-        reuse=self.reuse)
-    mask = tf.layers.dense(
-        mask, net.shape[-1], name='%s/mask' % self.name, reuse=self.reuse)
-    masked_net = net * mask
-
-    output_size = self.mask_block_config.output_size
-    hidden = tf.layers.dense(
-        masked_net,
-        output_size,
-        use_bias=False,
-        name='%s/output' % self.name,
-        reuse=self.reuse)
-    ln_hidden = layer_norm(
-        hidden, name='%s/ln_output' % self.name, reuse=self.reuse)
-    return tf.nn.relu(ln_hidden)
-
-
-class MaskNet(object):
-
-  def __init__(self, mask_net_config, name='mask_net', reuse=None):
-    """MaskNet: Introducing Feature-Wise Multiplication to CTR Ranking Models by Instance-Guided Mask.
-
-    Refer: https://arxiv.org/pdf/2102.07619.pdf
-    """
-    self.mask_net_config = mask_net_config
-    self.name = name
-    self.reuse = reuse
-
-  def __call__(self, inputs, is_training, l2_reg=None):
-    conf = self.mask_net_config
-    if conf.use_parallel:
-      mask_outputs = []
-      for i, block_conf in enumerate(self.mask_net_config.mask_blocks):
-        mask_layer = MaskBlock(
-            block_conf, name='%s/block_%d' % (self.name, i), reuse=self.reuse)
-        mask_outputs.append(mask_layer(mask_input=inputs, net=inputs))
-      all_mask_outputs = tf.concat(mask_outputs, axis=1)
-
-      if conf.HasField('mlp'):
-        mlp = dnn.DNN(
-            conf.mlp,
-            l2_reg,
-            name='%s/mlp' % self.name,
-            is_training=is_training,
-            reuse=self.reuse)
-        output = mlp(all_mask_outputs)
-      else:
-        output = all_mask_outputs
-      return output
-    else:
-      net = inputs
-      for i, block_conf in enumerate(self.mask_net_config.mask_blocks):
-        mask_layer = MaskBlock(
-            block_conf, name='%s/block_%d' % (self.name, i), reuse=self.reuse)
-        net = mask_layer(net=net, mask_input=inputs)
-
-      if conf.HasField('mlp'):
-        mlp = dnn.DNN(
-            conf.mlp,
-            l2_reg,
-            name='%s/mlp' % self.name,
-            is_training=is_training,
-            reuse=self.reuse)
-        output = mlp(net)
-      else:
-        output = net
-      return output
diff --git a/easy_rec/python/layers/sequence_encoder.py b/easy_rec/python/layers/sequence_encoder.py
index 5286215d4..24dab9754 100644
--- a/easy_rec/python/layers/sequence_encoder.py
+++ b/easy_rec/python/layers/sequence_encoder.py
@@ -4,8 +4,8 @@
 
 import tensorflow as tf
 
-from easy_rec.python.layers.bst import BST
-from easy_rec.python.layers.din import DIN
+from easy_rec.python.layers.keras.bst import BST
+from easy_rec.python.layers.keras.din import DIN
 from easy_rec.python.protos.feature_config_pb2 import FeatureConfig
 
 if tf.__version__ >= '2.0':
diff --git a/easy_rec/python/layers/utils.py b/easy_rec/python/layers/utils.py
index 43204241c..1ba585e07 100644
--- a/easy_rec/python/layers/utils.py
+++ b/easy_rec/python/layers/utils.py
@@ -158,3 +158,60 @@ def mark_input_src(name, src_desc):
                             'name': name,
                             'src': src_desc
                         }))
+
+
+class Parameter(object):
+
+  def __init__(self, params, is_struct, l2_reg=None):
+    self.params = params
+    self.is_struct = is_struct
+    self._l2_reg = l2_reg
+
+  @staticmethod
+  def make_from_pb(config):
+    return Parameter(config, False)
+
+  def get_pb_config(self):
+    assert not self.is_struct, 'Struct parameter can not convert to pb config'
+    return self.params
+
+  @property
+  def l2_regularizer(self):
+    return self._l2_reg
+
+  @l2_regularizer.setter
+  def l2_regularizer(self, value):
+    self._l2_reg = value
+
+  def __getattr__(self, key):
+    if self.is_struct:
+      return self.params[key]
+    return getattr(self.params, key)
+
+  def __getitem__(self, key):
+    if self.is_struct:
+      return self.params[key]
+    return getattr(self.params, key)
+
+  def get_or_default(self, key, def_val):
+    if self.is_struct:
+      if key in self.params:
+        return self.params[key]
+      return def_val
+    else:  # pb message
+      return getattr(self.params, key)
+
+  def check_required(self, keys):
+    if not self.is_struct:
+      return
+    if not isinstance(keys, (list, tuple)):
+      keys = [keys]
+    for key in keys:
+      if key not in self.params:
+        raise KeyError('%s must be set in params')
+
+  def has_field(self, key):
+    if self.is_struct:
+      return key in self.params
+    else:
+      return self.params.HasField(key)
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index f1a3189f2..b114d0788 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -64,14 +64,13 @@ def __init__(self,
     if constant.SAMPLE_WEIGHT in features:
       self._sample_weight = features[constant.SAMPLE_WEIGHT]
 
-    self._sequence_encoder = SequenceEncoder(self._input_layer, feature_configs,
-                                             model_config.feature_groups,
-                                             self._l2_reg)
-    self._sequence_encoding_by_group_name = {}
+    # self._sequence_encoder = SequenceEncoder(self._input_layer, feature_configs,
+    #                                          model_config.feature_groups,
+    #                                          self._l2_reg)
+    # self._sequence_encoding_by_group_name = {}
     if model_config.HasField('backbone'):
       self._backbone = Backbone(
           model_config.backbone,
-          self,
           features,
           input_layer=self._input_layer,
           l2_reg=self._l2_reg)
@@ -85,7 +84,10 @@ def has_backbone(self):
   @property
   def backbone(self):
     if self._backbone:
-      return self._backbone(self._is_training)
+      output = self._backbone(self._is_training)
+      loss_dict = self._backbone.loss_dict
+      self._loss_dict.update(loss_dict)
+      return output
     return None
 
   @property
@@ -135,50 +137,50 @@ def build_input_layer(self, model_config, feature_configs):
         is_training=self._is_training,
         is_predicting=self._is_predicting)
 
-  def get_sequence_encoding(self, group_name=None, is_training=True):
-    if group_name is not None:
-      if group_name in self._sequence_encoding_by_group_name:
-        return self._sequence_encoding_by_group_name[group_name]
-      encoding = self._sequence_encoder(
-          self._feature_dict,
-          group_name,
-          is_training,
-          loss_dict=self._loss_dict)
-      self._sequence_encoding_by_group_name[group_name] = encoding
-      return encoding
-
-    seq_encoding = []
-    for group in self.feature_groups:
-      if len(group.sequence_encoders) == 0:
-        continue
-      group_name = group.group_name
-      if group_name in self._sequence_encoding_by_group_name:
-        encoding = self._sequence_encoding_by_group_name[group_name]
-      else:
-        encoding = self._sequence_encoder(
-            self._feature_dict,
-            group_name,
-            is_training,
-            loss_dict=self._loss_dict)
-        self._sequence_encoding_by_group_name[group_name] = encoding
-      if encoding is not None:
-        seq_encoding.append(encoding)
-
-    if len(seq_encoding) > 1:
-      encoding = tf.concat(seq_encoding, axis=-1)
-    elif len(seq_encoding) == 1:
-      encoding = seq_encoding[0]
-    else:
-      return None
-
-    # if self._base_model_config.HasField('sequence_dnn'):
-    #   sequence_dnn = dnn.DNN(
-    #       self._base_model_config.sequence_dnn,
-    #       self._l2_reg,
-    #       name='sequence_dnn',
-    #       is_training=self._is_training)
-    #   encoding = sequence_dnn(encoding)
-    return encoding
+  # def get_sequence_encoding(self, group_name=None, is_training=True):
+  #   if group_name is not None:
+  #     if group_name in self._sequence_encoding_by_group_name:
+  #       return self._sequence_encoding_by_group_name[group_name]
+  #     encoding = self._sequence_encoder(
+  #         self._feature_dict,
+  #         group_name,
+  #         is_training,
+  #         loss_dict=self._loss_dict)
+  #     self._sequence_encoding_by_group_name[group_name] = encoding
+  #     return encoding
+  #
+  #   seq_encoding = []
+  #   for group in self.feature_groups:
+  #     if len(group.sequence_encoders) == 0:
+  #       continue
+  #     group_name = group.group_name
+  #     if group_name in self._sequence_encoding_by_group_name:
+  #       encoding = self._sequence_encoding_by_group_name[group_name]
+  #     else:
+  #       encoding = self._sequence_encoder(
+  #           self._feature_dict,
+  #           group_name,
+  #           is_training,
+  #           loss_dict=self._loss_dict)
+  #       self._sequence_encoding_by_group_name[group_name] = encoding
+  #     if encoding is not None:
+  #       seq_encoding.append(encoding)
+  #
+  #   if len(seq_encoding) > 1:
+  #     encoding = tf.concat(seq_encoding, axis=-1)
+  #   elif len(seq_encoding) == 1:
+  #     encoding = seq_encoding[0]
+  #   else:
+  #     return None
+  #
+  #   # if self._base_model_config.HasField('sequence_dnn'):
+  #   #   sequence_dnn = dnn.DNN(
+  #   #       self._base_model_config.sequence_dnn,
+  #   #       self._l2_reg,
+  #   #       name='sequence_dnn',
+  #   #       is_training=self._is_training)
+  #   #   encoding = sequence_dnn(encoding)
+  #   return encoding
 
   @abstractmethod
   def build_predict_graph(self):
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index 7d6b9e877..a5f447d86 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -35,17 +35,19 @@ def build_predict_graph(self):
           'method `build_predict_graph` must be implemented when backbone network do not exits'
       )
     output = self.backbone
-
-    model_config = getattr(self._base_model_config,
-                           self._base_model_config.WhichOneof('model'))
-    if hasattr(model_config, 'add_head_logits_layer') and \
-        model_config.HasField('add_head_logits_layer'):
-      add_head_logits_layer = model_config.add_head_logits_layer
-    else:
-      add_head_logits_layer = True
-    if add_head_logits_layer:
+    if int(output.shape[-1]) != self._num_class:
       logging.info('add head logits layer for rank model')
       output = tf.layers.dense(output, self._num_class, name='output')
+    # model_config = getattr(self._base_model_config,
+    #                        self._base_model_config.WhichOneof('model'))
+    # if hasattr(model_config, 'add_head_logits_layer') and \
+    #     model_config.HasField('add_head_logits_layer'):
+    #   add_head_logits_layer = model_config.add_head_logits_layer
+    # else:
+    #   add_head_logits_layer = True
+    # if add_head_logits_layer:
+    #   logging.info('add head logits layer for rank model')
+    #   output = tf.layers.dense(output, self._num_class, name='output')
 
     self._add_to_prediction_dict(output)
     return self._prediction_dict
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
index b37b14b2c..a11944d95 100644
--- a/easy_rec/python/protos/backbone.proto
+++ b/easy_rec/python/protos/backbone.proto
@@ -2,14 +2,17 @@ syntax = "proto2";
 package protos;
 
 import "easy_rec/python/protos/dnn.proto";
-import "easy_rec/python/protos/fm.proto";
-import "easy_rec/python/protos/layer.proto";
-import "easy_rec/python/protos/fibinet.proto";
-import "easy_rec/python/protos/masknet.proto";
-
-
-message SequenceLayer {
-    optional MLP mlp = 1;
+import "easy_rec/python/protos/keras_layer.proto";
+
+message InputLayer {
+    optional bool do_batch_norm = 1;
+    optional bool do_layer_norm = 2;
+    optional float dropout_rate = 3;
+    optional float feature_dropout_rate = 4;
+    optional bool only_output_feature_list = 5;
+    optional bool only_output_3d_tensor = 6;
+    optional bool output_2d_tensor_and_feature_list = 7;
+    optional bool output_seq_and_normal_feature = 8;
 }
 
 message Lambda {
@@ -21,34 +24,18 @@ message Input {
     optional string input_fn = 2;
 }
 
-message KerasLayer {
-    required string class_name = 1;
-    optional Any params = 2;
-}
-
 message Block {
     required string name = 1;
     // the input names of feature groups or other blocks
     repeated Input inputs = 2;
     optional int32 input_concat_axis = 3 [default = -1];
-    optional string extra_input_fn = 4;
+    optional bool merge_inputs_into_list = 4;
+    optional string extra_input_fn = 5;
     oneof layer {
-        Lambda Lambda = 100;
         InputLayer input_layer = 101;
-        MLP mlp = 102;
-        PeriodicEmbedding periodic_embedding = 103;
-        AutoDisEmbedding auto_dis_embedding = 104;
-        SequenceLayer sequence_encoder = 105;
-        HighWayTower highway = 106;
-        MaskNet masknet = 107;
-        SENet senet = 108;
-        FiBiNetTower fibinet = 109;
-        FM fm = 110;
-        // Concatenate concat = 111;
-        // Reshape reshape = 112;
-        Add add = 113;
-        Dot dot = 114;
-        //OpChain chain = 116;
+        Lambda lambda = 102;
+        KerasLayer keras_layer = 103;
+        Sequential sequential = 104;
     }
 }
 
@@ -58,25 +45,13 @@ message BackboneTower {
     optional MLP top_mlp = 3;
 }
 
-//message Operator {
-//    oneof Op {
-//        MLP mlp = 102;
-//        PeriodicEmbedding periodic_embedding = 103;
-//        AutoDisEmbedding auto_dis_embedding = 104;
-//        HighWayTower highway = 106;
-//        MaskNet masknet = 107;
-//        SENet senet = 108;
-//        FiBiNetTower fibinet = 109;
-//        FM fm = 110;
-//        Concatenate concat = 111;
-//        Reshape reshape = 112;
-//        Add add = 113;
-//        Dot dot = 114;
-//        Lambda Lambda = 115;
-//        OpChain chain = 116;
-//    }
-//}
-//
-//message OpChain {
-//    repeated Operator ops = 1;
-//}
+message Layer {
+    oneof layer {
+        Lambda lambda = 101;
+        KerasLayer keras_layer = 102;
+    }
+}
+
+message Sequential {
+    repeated Layer layers = 1;
+}
diff --git a/easy_rec/python/protos/dnn.proto b/easy_rec/python/protos/dnn.proto
index 00fe79d82..ff40f0fe4 100644
--- a/easy_rec/python/protos/dnn.proto
+++ b/easy_rec/python/protos/dnn.proto
@@ -19,9 +19,13 @@ message MLP {
     // ratio of dropout
     repeated float dropout_ratio = 2;
     // activation function
-    optional string activation = 3 [default = 'tf.nn.relu'];
+    optional string activation = 3 [default = 'relu'];
     // use batch normalization
     optional bool use_bn = 4 [default = true];
-    optional bool last_layer_no_activation = 5 [default = false];
-    optional bool last_layer_no_batch_norm = 6 [default = false];
+    optional bool use_final_bn = 5 [default = true];
+    optional string final_activation = 6 [default = 'relu'];
+    optional bool use_bias = 7 [default = true];
+    // kernel_initializer
+    optional string initializer = 8 [default = 'he_uniform'];
+    optional bool use_bn_after_activation = 9;
 }
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 48c6f4f8d..2bb801847 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -29,11 +29,12 @@ import "easy_rec/python/protos/multi_tower_recall.proto";
 // for input performance test
 message DummyModel {
 }
+
 // configure backbone network in a free style way
 message RankModel {
   optional float l2_regularization = 1;
-  optional bool add_head_logits_layer = 2 [default=true];
-  optional uint32 wide_output_dim = 3;
+  optional uint32 wide_output_dim = 2;
+  // optional bool add_head_logits_layer = 3 [default=true];
 }
 
 // for knowledge distillation
diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto
index e8b3b2c4f..ee245b0e9 100644
--- a/easy_rec/python/protos/feature_config.proto
+++ b/easy_rec/python/protos/feature_config.proto
@@ -145,7 +145,7 @@ message FeatureGroupConfig {
     optional WideOrDeep wide_deep = 3 [default = DEEP];
     repeated SeqAttGroupConfig sequence_features = 4;
     optional bool negative_sampler = 5 [default = false];
-    repeated SequenceEncoder sequence_encoders = 6;
+    // repeated SequenceEncoder sequence_encoders = 6;
 }
 
 message SeqAttMap {
diff --git a/easy_rec/python/protos/fibinet.proto b/easy_rec/python/protos/fibinet.proto
deleted file mode 100644
index 1d48448eb..000000000
--- a/easy_rec/python/protos/fibinet.proto
+++ /dev/null
@@ -1,23 +0,0 @@
-syntax = "proto2";
-package protos;
-
-import "easy_rec/python/protos/dnn.proto";
-
-message SENet {
-    required uint32 reduction_ratio = 1 [default = 4];
-    optional uint32 num_squeeze_group = 2 [default = 2];
-    optional bool use_skip_connection = 3 [default = true];
-    optional bool use_output_layer_norm = 4 [default = true];
-}
-
-message Bilinear {
-    required string type = 1 [default = 'interaction'];
-    required bool use_plus = 2 [default = true];
-    required uint32 num_output_units = 3;
-}
-
-message FiBiNetTower {
-    optional Bilinear bilinear = 1;
-    required SENet senet = 2;
-    optional DNN mlp = 8;
-}
diff --git a/easy_rec/python/protos/keras_layer.proto b/easy_rec/python/protos/keras_layer.proto
new file mode 100644
index 000000000..94a3ba801
--- /dev/null
+++ b/easy_rec/python/protos/keras_layer.proto
@@ -0,0 +1,26 @@
+syntax = "proto2";
+package protos;
+
+import "google/protobuf/struct.proto";
+import "easy_rec/python/protos/layer.proto";
+import "easy_rec/python/protos/dnn.proto";
+import "easy_rec/python/protos/fm.proto";
+import "easy_rec/python/protos/seq_encoder.proto";
+
+message KerasLayer {
+    required string class_name = 1;
+    oneof params {
+        google.protobuf.Struct st_params = 2;
+        PeriodicEmbedding periodic_embedding = 3;
+        AutoDisEmbedding auto_dis_embedding = 4;
+        FM fm = 5;
+        MaskBlock mask_block = 6;
+        MaskNet masknet = 7;
+        SENet senet = 8;
+        Bilinear bilinear = 9;
+        FiBiNet fibinet = 10;
+        MLP mlp = 11;
+        DINEncoder din = 12;
+        BSTEncoder bst = 13;
+    }
+}
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index e7ad65460..9a1e40acb 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -3,16 +3,6 @@ package protos;
 
 import "easy_rec/python/protos/dnn.proto";
 
-message InputLayer {
-    optional bool do_batch_norm = 1;
-    optional bool do_layer_norm = 2;
-    optional float dropout_rate = 3;
-    optional float feature_dropout_rate = 4;
-    optional bool only_output_feature_list = 5;
-    optional bool only_output_3d_tensor = 6;
-    optional bool output_2d_tensor_and_feature_list = 7;
-}
-
 message HighWayTower {
     optional string input = 1;
     required uint32 emb_size = 2;
@@ -38,18 +28,34 @@ message AutoDisEmbedding {
     optional bool output_tensor_list = 6;
 }
 
-message Concatenate {
-    required int32 axis = 1;
-    optional int32 expand_dim_before = 2;
-    optional int32 expand_dim_after = 3;
+message SENet {
+    required uint32 reduction_ratio = 1 [default = 4];
+    optional uint32 num_squeeze_group = 2 [default = 2];
+    optional bool use_skip_connection = 3 [default = true];
+    optional bool use_output_layer_norm = 4 [default = true];
+}
+
+message Bilinear {
+    required string type = 1 [default = 'interaction'];
+    required bool use_plus = 2 [default = true];
+    required uint32 num_output_units = 3;
 }
 
-message Reshape {
-    repeated int32 dims = 1;
+message FiBiNet {
+    optional Bilinear bilinear = 1;
+    required SENet senet = 2;
+    optional MLP mlp = 8;
 }
 
-message Add {
+message MaskBlock {
+    optional float reduction_factor = 1;
+    required uint32 output_size = 2;
+    optional uint32 aggregation_size = 3;
+    optional bool input_layer_norm = 4 [default = true];
 }
 
-message Dot {
+message MaskNet {
+    repeated MaskBlock mask_blocks = 1;
+    required bool use_parallel = 2 [default = true];
+    optional MLP mlp = 3;
 }
diff --git a/easy_rec/python/protos/masknet.proto b/easy_rec/python/protos/masknet.proto
deleted file mode 100644
index 3feba334e..000000000
--- a/easy_rec/python/protos/masknet.proto
+++ /dev/null
@@ -1,17 +0,0 @@
-syntax = "proto2";
-package protos;
-
-import "easy_rec/python/protos/dnn.proto";
-
-message MaskBlock {
-    optional float reduction_factor = 1;
-    required uint32 output_size = 2;
-    optional uint32 aggregation_size = 3;
-    optional bool input_layer_norm = 4 [default = true];
-}
-
-message MaskNet {
-    repeated MaskBlock mask_blocks = 1;
-    required bool use_parallel = 2 [default = true];
-    optional DNN mlp = 3;
-}
diff --git a/easy_rec/python/utils/load_class.py b/easy_rec/python/utils/load_class.py
index efd2cc9cb..0cf12c26f 100644
--- a/easy_rec/python/utils/load_class.py
+++ b/easy_rec/python/utils/load_class.py
@@ -229,7 +229,7 @@ def load_keras_layer(name):
     name: keras layer name
 
   Return:
-    modules or functions or classes
+    (layer_class, is_customize)
   """
   name = name.strip()
   if name == '' or name is None:
@@ -237,13 +237,13 @@ def load_keras_layer(name):
 
   path = 'easy_rec.python.layers.keras.' + name
   try:
-    return pydoc.locate(path)
-  except pydoc.ErrorDuringImport:
+    cls = pydoc.locate(path)
+    if cls is not None:
+      return cls, True
     path = 'tensorflow.keras.layers.' + name
-    try:
-      return pydoc.locate(path)
-    except pydoc.ErrorDuringImport:
-      print('load keras layer %s failed' % name)
-      logging.error('load keras layer %s failed: %s' %
-                    (name, traceback.format_exc()))
-      return None
+    return pydoc.locate(path), False
+  except pydoc.ErrorDuringImport:
+    print('load keras layer %s failed' % name)
+    logging.error('load keras layer %s failed: %s' %
+                   (name, traceback.format_exc()))
+    return None, False
diff --git a/easy_rec/python/utils/tf_utils.py b/easy_rec/python/utils/tf_utils.py
index e4d39c012..160a2f67a 100644
--- a/easy_rec/python/utils/tf_utils.py
+++ b/easy_rec/python/utils/tf_utils.py
@@ -48,37 +48,36 @@ def get_config_type(tf_type):
   return type_map[tf_type]
 
 
-def add_op(inputs):
-  if not isinstance(inputs, list):
-    return inputs
-  if len(inputs) == 1:
-    if isinstance(inputs[0], list):
-      return tf.keras.layers.Add()(inputs[0])
-    return inputs[0]
-  return tf.keras.layers.Add()(inputs)
+# def add_op(inputs):
+#   if not isinstance(inputs, list):
+#     return inputs
+#   if len(inputs) == 1:
+#     if isinstance(inputs[0], list):
+#       return tf.keras.layers.Add()(inputs[0])
+#     return inputs[0]
+#   return tf.keras.layers.Add()(inputs)
 
-
-def dot_op(features):
-  """Compute inner dot between any two pair tensors.
-
-  Args:
-    features: must be one of
-    - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
-    - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
-  Return:
-    - 2D tensor with shape: ``(batch_size, 1)``.
-  """
-  if isinstance(features, (list, tuple)):
-    features = tf.stack(features, axis=1)
-  assert features.shape.ndims == 3, 'input of dot func must be a 3D tensor or a list of 2D tensors'
-
-  batch_size = tf.shape(features)[0]
-  matrixdot = tf.matmul(features, features, transpose_b=True)
-  feature_dim = matrixdot.shape[-1]
-
-  ones_mat = tf.ones_like(matrixdot)
-  lower_tri_mat = ones_mat - tf.linalg.band_part(ones_mat, 0, -1)
-  lower_tri_mask = tf.cast(lower_tri_mat, tf.bool)
-  result = tf.boolean_mask(matrixdot, lower_tri_mask)
-  output_dim = feature_dim * (feature_dim - 1) // 2
-  return tf.reshape(result, (batch_size, output_dim))
+# def dot_op(features):
+#   """Compute inner dot between any two pair tensors.
+#
+#   Args:
+#     features: must be one of
+#     - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
+#     - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
+#   Return:
+#     - 2D tensor with shape: ``(batch_size, 1)``.
+#   """
+#   if isinstance(features, (list, tuple)):
+#     features = tf.stack(features, axis=1)
+#   assert features.shape.ndims == 3, 'input of dot func must be a 3D tensor or a list of 2D tensors'
+#
+#   batch_size = tf.shape(features)[0]
+#   matrixdot = tf.matmul(features, features, transpose_b=True)
+#   feature_dim = matrixdot.shape[-1]
+#
+#   ones_mat = tf.ones_like(matrixdot)
+#   lower_tri_mat = ones_mat - tf.linalg.band_part(ones_mat, 0, -1)
+#   lower_tri_mask = tf.cast(lower_tri_mat, tf.bool)
+#   result = tf.boolean_mask(matrixdot, lower_tri_mask)
+#   output_dim = feature_dim * (feature_dim - 1) // 2
+#   return tf.reshape(result, (batch_size, output_dim))
diff --git a/examples/configs/deepfm_backbone_on_criteo.config b/examples/configs/deepfm_backbone_on_criteo.config
index 467d8ad55..9cba3fb82 100644
--- a/examples/configs/deepfm_backbone_on_criteo.config
+++ b/examples/configs/deepfm_backbone_on_criteo.config
@@ -1,6 +1,6 @@
 train_input_path: "examples/data/criteo/criteo_train_data"
 eval_input_path: "examples/data/criteo/criteo_test_data"
-model_dir: "examples/ckpt/deepfm_backbone_criteo_w"
+model_dir: "examples/ckpt/deepfm_backbone_criteo"
 
 train_config {
   log_step_count_steps: 500
@@ -574,17 +574,12 @@ model_config: {
     wide_deep:WIDE
   }
   backbone {
-    blocks {
-      name: 'wide_features'
-      input_layer {
-      }
-    }
     blocks {
       name: 'wide_logit'
       inputs {
         name: 'wide_features'
       }
-      Lambda {
+      lambda {
         expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
       }
     }
@@ -600,8 +595,14 @@ model_config: {
         name: 'deep_features'
         input_fn: 'lambda x: x[1]'
       }
-      fm {
-        use_variant: true
+      keras_layer {
+        class_name: 'FM'
+        st_params {
+          fields {
+            key: 'use_variant'
+            value { bool_value: true }
+          }
+        }
       }
     }
     blocks {
@@ -610,8 +611,11 @@ model_config: {
         name: 'deep_features'
         input_fn: 'lambda x: x[0]'
       }
-      mlp {
-        hidden_units: [256, 128, 64]
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64]
+        }
       }
     }
     concat_blocks: ['wide_logit', 'fm', 'deep']
diff --git a/examples/configs/deepfm_backbone_on_movielens.config b/examples/configs/deepfm_backbone_on_movielens.config
index 46a79d83b..c6bf82151 100644
--- a/examples/configs/deepfm_backbone_on_movielens.config
+++ b/examples/configs/deepfm_backbone_on_movielens.config
@@ -1,6 +1,6 @@
 train_input_path: "examples/data/movielens_1m/movies_train_data"
 eval_input_path: "examples/data/movielens_1m/movies_test_data"
-model_dir: "examples/ckpt/deepfm_backbone_movieslen_ckpt"
+model_dir: "examples/ckpt/deepfm_backbone_movieslen"
 
 train_config {
   log_step_count_steps: 100
@@ -17,9 +17,8 @@ train_config {
     }
     use_moving_average: false
   }
-  save_checkpoints_steps: 100
+  save_checkpoints_steps: 2000
   sync_replicas: True
-  num_steps: 2500
 }
 
 eval_config {
@@ -150,6 +149,17 @@ feature_config: {
 }
 model_config: {
   model_class: 'RankModel'
+  feature_groups: {
+    group_name: 'wide'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: WIDE
+  }
   feature_groups: {
     group_name: 'features'
     feature_names: 'user_id'
@@ -164,28 +174,66 @@ model_config: {
   }
   backbone {
     blocks {
-      name: 'emb_list'
-      inputs: 'features'
+      name: 'wide_logit'
+      inputs {
+        name: 'wide'
+      }
+      lambda {
+        expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
+      }
+    }
+    blocks {
+      name: 'features'
       input_layer {
-        output_feature_list: true
+        output_2d_tensor_and_feature_list: true
       }
     }
     blocks {
       name: 'fm'
-      inputs: 'emb_list'
-      fm {}
+      inputs {
+        name: 'features'
+        input_fn: 'lambda x: x[1]'
+      }
+      keras_layer {
+        class_name: 'FM'
+      }
     }
     blocks {
       name: 'deep'
-      inputs: 'features'
-      mlp {
-        hidden_units: [256, 128, 64]
+      inputs {
+        name: 'features'
+        input_fn: 'lambda x: x[0]'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64, 1]
+          use_final_bn: false
+          final_activation: 'linear'
+        }
+      }
+    }
+    blocks {
+      name: 'add'
+      inputs {
+        name: 'wide_logit'
+      }
+      inputs {
+        name: 'fm'
+      }
+      inputs {
+        name: 'deep'
+      }
+      merge_inputs_into_list: true
+      keras_layer {
+        class_name: 'Add'
       }
     }
-    concat_blocks: ['fm', 'deep']
+    concat_blocks: 'add'
   }
   rank_model {
     l2_regularization: 1e-4
+    wide_output_dim: 1
   }
   embedding_regularization: 1e-4
 }
diff --git a/examples/configs/deepfm_on_movielens.config b/examples/configs/deepfm_on_movielens.config
index cab092c20..0468ae12f 100644
--- a/examples/configs/deepfm_on_movielens.config
+++ b/examples/configs/deepfm_on_movielens.config
@@ -137,7 +137,7 @@ feature_config: {
     sequence_combiner: {
       text_cnn: {
         filter_sizes: [2, 3, 4]
-        num_filters: [16, 8, 8]
+        num_filters: [8, 4, 4]
       }
     }
   }
diff --git a/examples/configs/dlrm_backbone_on_criteo.config b/examples/configs/dlrm_backbone_on_criteo.config
index e87acef39..afdc0f784 100644
--- a/examples/configs/dlrm_backbone_on_criteo.config
+++ b/examples/configs/dlrm_backbone_on_criteo.config
@@ -528,8 +528,11 @@ model_config: {
       inputs {
         name: 'dense'
       }
-      mlp {
-        hidden_units: [64, 32, 16]
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [64, 32, 16]
+        }
       }
     }
     blocks {
@@ -548,7 +551,9 @@ model_config: {
         name: 'sparse'
         input_fn: 'lambda x: x[1]'
       }
-      dot { }
+      keras_layer {
+        class_name: 'DotInteraction'
+      }
     }
     blocks {
       name: 'sparse_2d'
diff --git a/examples/configs/dlrm_on_criteo_with_autodis.config b/examples/configs/dlrm_on_criteo_with_autodis.config
index eb81e0a05..151bb4424 100644
--- a/examples/configs/dlrm_on_criteo_with_autodis.config
+++ b/examples/configs/dlrm_on_criteo_with_autodis.config
@@ -527,11 +527,14 @@ model_config: {
       inputs {
         name: 'dense'
       }
-      auto_dis_embedding {
-        embedding_dim: 16
-        num_bins: 20
-        temperature: 0.815
-        output_tensor_list: true
+      keras_layer {
+        class_name: 'AutoDisEmbedding'
+        auto_dis_embedding {
+          embedding_dim: 16
+          num_bins: 40
+          temperature: 0.815
+          output_tensor_list: true
+        }
       }
     }
     blocks {
@@ -550,7 +553,9 @@ model_config: {
         name: 'sparse'
         input_fn: 'lambda x: x[1]'
       }
-      dot { }
+      keras_layer {
+        class_name: 'DotInteraction'
+      }
     }
     blocks {
       name: 'sparse_2d'
diff --git a/examples/configs/dlrm_on_criteo_with_periodic.config b/examples/configs/dlrm_on_criteo_with_periodic.config
new file mode 100644
index 000000000..81d0db1b3
--- /dev/null
+++ b/examples/configs/dlrm_on_criteo_with_periodic.config
@@ -0,0 +1,591 @@
+train_input_path: "examples/data/criteo/criteo_train_data"
+eval_input_path: "examples/data/criteo/criteo_test_data"
+model_dir: "examples/ckpt/dlrm_periodic_criteo"
+
+train_config {
+  log_step_count_steps: 500
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 20000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+}
+
+data_config {
+  separator: "\t"
+  input_fields: {
+    input_name: "label"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F1"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F2"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F3"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F4"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F5"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F6"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F7"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F8"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F9"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F10"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F11"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F12"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "F13"
+    input_type: FLOAT
+    default_val:"0"
+  }
+  input_fields: {
+    input_name: "C1"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C2"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C3"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C4"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C5"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C6"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C7"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C8"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C9"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C10"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C11"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C12"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C13"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C14"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C15"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C16"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C17"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C18"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C19"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C20"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C21"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C22"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C23"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C24"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C25"
+    input_type: STRING
+    default_val:""
+  }
+  input_fields: {
+    input_name: "C26"
+    input_type: STRING
+    default_val:""
+  }
+  label_fields: "label"
+
+  batch_size: 4096
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+}
+
+feature_config: {
+  features: {
+    input_names: "F1"
+    feature_type: RawFeature
+    min_val:0.0
+    max_val: 5775.0
+  }
+  features: {
+    input_names: "F2"
+    feature_type: RawFeature
+    min_val: -3.0
+    max_val: 257675.0
+  }
+  features: {
+    input_names: "F3"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 65535.0
+  }
+  features: {
+    input_names: "F4"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 969.0
+  }
+  features: {
+    input_names: "F5"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 23159456.0
+  }
+  features: {
+    input_names: "F6"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 431037.0
+  }
+  features: {
+    input_names: "F7"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 56311.0
+  }
+  features: {
+    input_names: "F8"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 6047.0
+  }
+  features: {
+    input_names: "F9"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 29019.0
+  }
+  features: {
+    input_names: "F10"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 46.0
+  }
+  features: {
+    input_names: "F11"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 231.0
+  }
+  features: {
+    input_names: "F12"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 4008.0
+  }
+  features: {
+    input_names: "F13"
+    feature_type: RawFeature
+    min_val: 0.0
+    max_val: 7393.0
+  }
+  features: {
+    input_names: "C1"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C2"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C3"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C4"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C5"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C6"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C7"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C8"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C9"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C10"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C11"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C12"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C13"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C14"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C15"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C16"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C17"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C18"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C19"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C20"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C21"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C22"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C23"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C24"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }features: {
+    input_names: "C25"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+  features: {
+    input_names: "C26"
+    hash_bucket_size: 1000000
+    feature_type: IdFeature
+    embedding_dim: 16
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: "dense"
+    feature_names: "F1"
+    feature_names: "F2"
+    feature_names: "F3"
+    feature_names: "F4"
+    feature_names: "F5"
+    feature_names: "F6"
+    feature_names: "F7"
+    feature_names: "F8"
+    feature_names: "F9"
+    feature_names: "F10"
+    feature_names: "F11"
+    feature_names: "F12"
+    feature_names: "F13"
+    wide_deep:DEEP
+  }
+  feature_groups: {
+    group_name: "sparse"
+    feature_names: "C1"
+    feature_names: "C2"
+    feature_names: "C3"
+    feature_names: "C4"
+    feature_names: "C5"
+    feature_names: "C6"
+    feature_names: "C7"
+    feature_names: "C8"
+    feature_names: "C9"
+    feature_names: "C10"
+    feature_names: "C11"
+    feature_names: "C12"
+    feature_names: "C13"
+    feature_names: "C14"
+    feature_names: "C15"
+    feature_names: "C16"
+    feature_names: "C17"
+    feature_names: "C18"
+    feature_names: "C19"
+    feature_names: "C20"
+    feature_names: "C21"
+    feature_names: "C22"
+    feature_names: "C23"
+    feature_names: "C24"
+    feature_names: "C25"
+    feature_names: "C26"
+    wide_deep:DEEP
+  }
+  backbone {
+    blocks {
+      name: 'num_emb'
+      inputs {
+        name: 'dense'
+      }
+      keras_layer {
+        class_name: 'PeriodicEmbedding'
+        st_params {
+          fields {
+            key: "output_tensor_list"
+            value { bool_value: true }
+          }
+          fields {
+            key: "embedding_dim"
+            value { number_value: 16 }
+          }
+          fields {
+            key: "sigma"
+            value { number_value: 0.005 }
+          }
+        }
+      }
+    }
+    blocks {
+      name: 'sparse'
+      input_layer {
+        output_2d_tensor_and_feature_list: true
+      }
+    }
+    blocks {
+      name: 'dot'
+      inputs {
+        name: 'num_emb'
+        input_fn: 'lambda x: x[1]'
+      }
+      inputs {
+        name: 'sparse'
+        input_fn: 'lambda x: x[1]'
+      }
+      keras_layer {
+        class_name: 'DotInteraction'
+      }
+    }
+    blocks {
+      name: 'sparse_2d'
+      inputs {
+        name: 'sparse'
+        input_fn: 'lambda x: x[0]'
+      }
+    }
+    blocks {
+      name: 'num_emb_2d'
+      inputs {
+        name: 'num_emb'
+        input_fn: 'lambda x: x[0]'
+      }
+    }
+    concat_blocks: ['num_emb_2d', 'dot', 'sparse_2d']
+    top_mlp {
+      hidden_units: [256, 128, 64]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-5
+  }
+  embedding_regularization: 1e-5
+}
diff --git a/examples/configs/dlrm_standard_on_criteo.config b/examples/configs/dlrm_standard_on_criteo.config
index 131a94607..03e3df7bc 100644
--- a/examples/configs/dlrm_standard_on_criteo.config
+++ b/examples/configs/dlrm_standard_on_criteo.config
@@ -527,8 +527,11 @@ model_config: {
       inputs {
         name: 'dense'
       }
-      mlp {
-        hidden_units: [64, 32, 16]
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [64, 32, 16]
+        }
       }
     }
     blocks {
@@ -546,7 +549,9 @@ model_config: {
       inputs {
         name: 'sparse'
       }
-      dot { }
+      keras_layer {
+        class_name: 'DotInteraction'
+      }
     }
     concat_blocks: ['bottom_mlp', 'dot']
     top_mlp {
diff --git a/examples/configs/fibinet_on_movielens.config b/examples/configs/fibinet_on_movielens.config
index 8508172c6..aa6bef7f0 100644
--- a/examples/configs/fibinet_on_movielens.config
+++ b/examples/configs/fibinet_on_movielens.config
@@ -17,9 +17,8 @@ train_config {
     }
     use_moving_average: false
   }
-  save_checkpoints_steps: 100
-  sync_replicas: True
-  num_steps: 2500
+  save_checkpoints_steps: 2000
+  sync_replicas: False
 }
 
 eval_config {
@@ -163,26 +162,30 @@ model_config: {
   }
   backbone {
     blocks {
-      name: "emb_list"
-      inputs: "all"
+      name: "all"
       input_layer {
         do_batch_norm: true
-        output_feature_list: true
+        only_output_feature_list: true
       }
     }
     blocks {
       name: "fibinet"
-      inputs: "emb_list"
-      fibinet {
-        senet {
-          reduction_ratio: 4
-        }
-        bilinear {
-          type: 'each'
-          num_output_units: 512
-        }
-        mlp {
-          hidden_units: [512, 256]
+      inputs {
+        name: "all"
+      }
+      keras_layer {
+        class_name: 'FiBiNet'
+        fibinet {
+          senet {
+            reduction_ratio: 4
+          }
+          bilinear {
+            type: 'each'
+            num_output_units: 512
+          }
+          mlp {
+            hidden_units: [512, 256]
+          }
         }
       }
     }
diff --git a/examples/configs/masknet_on_movielens.config b/examples/configs/masknet_on_movielens.config
index 4c7f507b9..c98e3fbd0 100644
--- a/examples/configs/masknet_on_movielens.config
+++ b/examples/configs/masknet_on_movielens.config
@@ -164,22 +164,27 @@ model_config: {
   backbone {
     blocks {
       name: "mask_net"
-      inputs: "all"
-      masknet {
-        mask_blocks {
-          aggregation_size: 512
-          output_size: 256
-        }
-        mask_blocks {
-          aggregation_size: 512
-          output_size: 256
-        }
-        mask_blocks {
-          aggregation_size: 512
-          output_size: 256
-        }
-        mlp {
-          hidden_units: [512, 256]
+      inputs {
+        name: "all"
+      }
+      keras_layer {
+        class_name: 'MaskNet'
+        masknet {
+          mask_blocks {
+            aggregation_size: 512
+            output_size: 256
+          }
+          mask_blocks {
+            aggregation_size: 512
+            output_size: 256
+          }
+          mask_blocks {
+            aggregation_size: 512
+            output_size: 256
+          }
+          mlp {
+            hidden_units: [512, 256]
+          }
         }
       }
     }
diff --git a/examples/readme.md b/examples/readme.md
index 94643541e..d33304faf 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -212,8 +212,8 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
   | Model            | Epoch | AUC    |
   | ---------------- | ----- | ------ |
   | Wide&Deep        | 1     | 0.8558 |
-  | DeepFM           | 1     | 0.8688 |
-  | DeepFM(Backbone) | 1     | 0.8876 |
+  | DeepFM           | 1     | 0.8867 |
+  | DeepFM(Backbone) | 1     | 0.8872 |
   | DCN              | 1     | 0.8576 |
   | AutoInt          | 1     | 0.8513 |
   | MaskNet          | 1     | 0.8872 |
@@ -221,17 +221,18 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
 
 - Criteo-Research
 
-  | Model             | Epoch | AUC    |
-  | ----------------- | ----- | ------ |
-  | FM                | 1     | 0.7577 |
-  | DeepFM            | 1     | 0.7970 |
-  | DeepFM (backbone) | 1     | 0.7970 |
-  | DeepFM (periodic) | 1     | 0.7980 |
-  | DeepFM (autodis)  | 1     | 0.7979 |
+  | Model             | Epoch | AUC     |
+  | ----------------- | ----- | ------- |
+  | FM                | 1     | 0.7577  |
+  | DeepFM            | 1     | 0.7970  |
+  | DeepFM (backbone) | 1     | 0.7970  |
+  | DeepFM (periodic) | 1     | 0.7980  |
+  | DeepFM (autodis)  | 1     | 0.7979  |
   | DLRM              | 1     | 0.79785 |
-  | DLRM (backbone)   | 1     | 0.7993 |
-  | DLRM (standard)   | 1     | 0.7949 |
-  | DLRM (autodis)    | 1     | 0.7984 |
+  | DLRM (backbone)   | 1     | 0.7993  |
+  | DLRM (standard)   | 1     | 0.7949  |
+  | DLRM (autodis)    | 1     | 0.7989  |
+  | DLRM (periodic)   | 1     | 0.7998  |
 
 ### 召回模型
 

From 9234140a7f8ebee9232b524bbbfddbd68d85a074 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Sun, 18 Jun 2023 19:28:18 +0800
Subject: [PATCH 35/54] [feat]: add more backbone blocks

---
 easy_rec/python/layers/backbone.py            |  14 +-
 easy_rec/python/layers/common_layers.py       |   2 +-
 easy_rec/python/layers/keras/__init__.py      |   5 +-
 easy_rec/python/layers/keras/blocks.py        |  33 ++-
 easy_rec/python/layers/keras/bst.py           |   2 +-
 easy_rec/python/layers/keras/dcn.py           |   9 +-
 easy_rec/python/layers/keras/din.py           |   3 +-
 easy_rec/python/utils/load_class.py           |   2 +-
 .../configs/dcn_backbone_on_movielens.config  | 250 ++++++++++++++++++
 examples/readme.md                            |   1 +
 10 files changed, 292 insertions(+), 29 deletions(-)
 create mode 100644 examples/configs/dcn_backbone_on_movielens.config

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 139e31fee..d5fac8a49 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -3,6 +3,7 @@
 import logging
 
 import tensorflow as tf
+from google.protobuf import struct_pb2
 
 from easy_rec.python.layers.common_layers import EnhancedInputLayer
 from easy_rec.python.layers.keras import MLP
@@ -10,7 +11,6 @@
 from easy_rec.python.protos import backbone_pb2
 from easy_rec.python.utils.dag import DAG
 from easy_rec.python.utils.load_class import load_keras_layer
-from google.protobuf import struct_pb2
 
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
@@ -127,7 +127,8 @@ def __call__(self, is_training, **kwargs):
         block_outputs[block] = output
       else:
         inputs = block_input(config, block_outputs)
-        block_outputs[block] = self.call_layer(inputs, config, block, is_training)
+        block_outputs[block] = self.call_layer(inputs, config, block,
+                                               is_training)
 
     temp = []
     for output in self._config.concat_blocks:
@@ -170,10 +171,10 @@ def call_keras_layer(self, layer_conf, inputs, name, training):
       return layer(inputs, training=training)
 
   def call_sequential_layers(self, inputs, layers, name, training):
-   output = inputs
-   for layer in layers:
-     output = self.call_layer(output, layer, name, training)
-   return output
+    output = inputs
+    for layer in layers:
+      output = self.call_layer(output, layer, name, training)
+    return output
 
   def call_layer(self, inputs, config, name, training):
     layer_name = config.WhichOneof('layer')
@@ -221,4 +222,3 @@ def convert_to_dict(struct):
   for key, value in struct.items():
     kwargs[str(key)] = format_value(value)
   return kwargs
-
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index 810654cf3..011efb061 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -94,7 +94,7 @@ def __init__(self, config, input_layer, feature_dict):
   def __call__(self, group, is_training, *args, **kwargs):
     if self._config.output_seq_and_normal_feature:
       seq_features, target_feature, target_features = self._input_layer(
-        self._feature_dict, group, is_combine=False)
+          self._feature_dict, group, is_combine=False)
       return seq_features, target_features
 
     features, feature_list = self._input_layer(self._feature_dict, group)
diff --git a/easy_rec/python/layers/keras/__init__.py b/easy_rec/python/layers/keras/__init__.py
index d0dda33cf..64cacf3c9 100644
--- a/easy_rec/python/layers/keras/__init__.py
+++ b/easy_rec/python/layers/keras/__init__.py
@@ -1,7 +1,8 @@
-from .blocks import MLP, Highway
+from .blocks import MLP
+from .blocks import Highway
 from .bst import BST
-from .din import DIN
 from .dcn import Cross
+from .din import DIN
 from .dot_interaction import DotInteraction
 from .fibinet import BiLinear
 from .fibinet import FiBiNet
diff --git a/easy_rec/python/layers/keras/blocks.py b/easy_rec/python/layers/keras/blocks.py
index 507723017..2c7f08403 100644
--- a/easy_rec/python/layers/keras/blocks.py
+++ b/easy_rec/python/layers/keras/blocks.py
@@ -2,9 +2,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 """Convenience blocks for building models."""
 import logging
-from easy_rec.python.utils.activation import get_activation
+
 import tensorflow as tf
 
+from easy_rec.python.utils.activation import get_activation
+
 
 class MLP(tf.keras.layers.Layer):
   """Sequential multi-layer perceptron (MLP) block.
@@ -31,9 +33,9 @@ def __init__(self, params, name='mlp', **kwargs):
     units = list(params.hidden_units)
     logging.info(
         'MLP(%s) units: %s, dropout: %r, activate=%s, use_bn=%r, final_bn=%r,'
-        ' final_activate=%s, bias=%r, initializer=%s, bn_after_activation=%r'
-        % (name, units, dropout_rate, activation, use_bn, use_final_bn,
-           final_activation, use_bias, initializer, use_bn_after_act))
+        ' final_activate=%s, bias=%r, initializer=%s, bn_after_activation=%r' %
+        (name, units, dropout_rate, activation, use_bn, use_final_bn,
+         final_activation, use_bias, initializer, use_bn_after_act))
 
     num_dropout = len(dropout_rate)
     self._sub_layers = []
@@ -41,13 +43,15 @@ def __init__(self, params, name='mlp', **kwargs):
       name = 'dnn_%d' % i
       drop_rate = dropout_rate[i] if i < num_dropout else 0.0
       self.add_rich_layer(num_units, use_bn, drop_rate, activation, initializer,
-                          use_bias, use_bn_after_act, name, params.l2_regularizer)
+                          use_bias, use_bn_after_act, name,
+                          params.l2_regularizer)
 
     n = len(units) - 1
     drop_rate = dropout_rate[n] if num_dropout > n else 0.0
     name = 'dnn_%d' % n
     self.add_rich_layer(units[-1], use_final_bn, drop_rate, final_activation,
-                        initializer, use_bias, use_bn_after_act, name, params.l2_regularizer)
+                        initializer, use_bias, use_bn_after_act, name,
+                        params.l2_regularizer)
 
   def add_rich_layer(self,
                      num_units,
@@ -70,7 +74,8 @@ def add_rich_layer(self,
       self._sub_layers.append(dense)
       # bn = tf.keras.layers.BatchNormalization(name='%s/bn' % name)
       # keras BN layer have a stale issue on some versions of tf
-      bn = lambda x, training: tf.layers.batch_normalization(x, training=training, name='%s/bn' % name)
+      bn = lambda x, training: tf.layers.batch_normalization(
+          x, training=training, name='%s/%s/bn' % (self.name, name))
       self._sub_layers.append(bn)
       act = tf.keras.layers.Activation(act_fn, name='%s/act' % name)
       self._sub_layers.append(act)
@@ -84,7 +89,8 @@ def add_rich_layer(self,
           name=name)
       self._sub_layers.append(dense)
       if use_bn and use_bn_after_activation:
-        bn = lambda x, training: tf.layers.batch_normalization(x, training=training, name='%s/bn' % name)
+        bn = lambda x, training: tf.layers.batch_normalization(
+            x, training=training, name='%s/%s/bn' % (self.name, name))
         self._sub_layers.append(bn)
 
     if 0.0 < dropout_rate < 1.0:
@@ -101,6 +107,7 @@ def call(self, x, training=None, **kwargs):
 
 
 class Highway(tf.keras.layers.Layer):
+
   def __init__(self, params, name='highway', **kwargs):
     super(Highway, self).__init__(name, **kwargs)
     params.check_required('emb_size')
@@ -111,7 +118,9 @@ def __init__(self, params, name='highway', **kwargs):
 
   def call(self, inputs, training=None, **kwargs):
     from easy_rec.python.layers.common_layers import highway
-    return highway(inputs, self.emb_size,
-                   activation=self.activation,
-                   num_layers=self.num_layers,
-                   dropout=self.dropout_rate if training else 0.0)
+    return highway(
+        inputs,
+        self.emb_size,
+        activation=self.activation,
+        num_layers=self.num_layers,
+        dropout=self.dropout_rate if training else 0.0)
diff --git a/easy_rec/python/layers/keras/bst.py b/easy_rec/python/layers/keras/bst.py
index 9492fda07..f8b876fb4 100644
--- a/easy_rec/python/layers/keras/bst.py
+++ b/easy_rec/python/layers/keras/bst.py
@@ -1,13 +1,13 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import tensorflow as tf
+from tensorflow.python.keras.layers import Layer
 
 from easy_rec.python.input.augment import input_aug_data
 from easy_rec.python.layers import multihead_cross_attention
 from easy_rec.python.loss.nce_loss import nce_loss
 from easy_rec.python.utils.activation import get_activation
 from easy_rec.python.utils.shape_utils import get_shape_list
-from tensorflow.python.keras.layers import Layer
 
 
 class BST(Layer):
diff --git a/easy_rec/python/layers/keras/dcn.py b/easy_rec/python/layers/keras/dcn.py
index 5fe4d4c42..9585893e5 100644
--- a/easy_rec/python/layers/keras/dcn.py
+++ b/easy_rec/python/layers/keras/dcn.py
@@ -4,6 +4,8 @@
 
 import tensorflow as tf
 
+from easy_rec.python.utils.activation import get_activation
+
 
 class Cross(tf.keras.layers.Layer):
   """Cross Layer in Deep & Cross Network to learn explicit feature interactions.
@@ -70,7 +72,8 @@ def __init__(self, params, **kwargs):
     self._diag_scale = params.get_or_default('diag_scale', 0.0)
     self._use_bias = params.get_or_default('use_bias', True)
     preactivation = params.get_or_default('preactivation', None)
-    self._preactivation = tf.keras.activations.get(preactivation)
+    preact = get_activation(preactivation)
+    self._preactivation = tf.keras.activations.get(preact)
     kernel_initializer = params.get_or_default('kernel_initializer',
                                                'truncated_normal')
     self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
@@ -89,7 +92,7 @@ def __init__(self, params, **kwargs):
               self._diag_scale))
 
   def build(self, input_shape):
-    last_dim = input_shape[-1]
+    last_dim = input_shape[0][-1]
 
     if self._projection_dim is None:
       self._dense = tf.keras.layers.Dense(
@@ -154,7 +157,7 @@ def call(self, inputs, **kwargs):
     else:
       prod_output = self._dense_v(self._dense_u(x))
 
-    prod_output = tf.cast(prod_output, self.compute_dtype)
+    # prod_output = tf.cast(prod_output, self.compute_dtype)
 
     if self._diag_scale:
       prod_output = prod_output + self._diag_scale * x
diff --git a/easy_rec/python/layers/keras/din.py b/easy_rec/python/layers/keras/din.py
index 686d23e00..cee57ac90 100644
--- a/easy_rec/python/layers/keras/din.py
+++ b/easy_rec/python/layers/keras/din.py
@@ -3,12 +3,11 @@
 import logging
 
 import tensorflow as tf
+from tensorflow.python.keras.layers import Layer
 
 from easy_rec.python.layers import dnn
 from easy_rec.python.utils.shape_utils import get_shape_list
 
-from tensorflow.python.keras.layers import Layer
-
 
 class DIN(Layer):
 
diff --git a/easy_rec/python/utils/load_class.py b/easy_rec/python/utils/load_class.py
index 0cf12c26f..9ac749c76 100644
--- a/easy_rec/python/utils/load_class.py
+++ b/easy_rec/python/utils/load_class.py
@@ -245,5 +245,5 @@ def load_keras_layer(name):
   except pydoc.ErrorDuringImport:
     print('load keras layer %s failed' % name)
     logging.error('load keras layer %s failed: %s' %
-                   (name, traceback.format_exc()))
+                  (name, traceback.format_exc()))
     return None, False
diff --git a/examples/configs/dcn_backbone_on_movielens.config b/examples/configs/dcn_backbone_on_movielens.config
new file mode 100644
index 000000000..f16337fdd
--- /dev/null
+++ b/examples/configs/dcn_backbone_on_movielens.config
@@ -0,0 +1,250 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/dcn_on_movieslen"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: false
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_class: 'RankModel'
+  feature_groups: {
+    group_name: 'all'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: "deep"
+      inputs {
+        name: 'all'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64]
+        }
+      }
+    }
+    blocks {
+      name: "cross1"
+      inputs {
+        name: 'all'
+        input_fn: 'lambda x: [x, x]'
+      }
+      keras_layer {
+        class_name: 'Cross'
+      }
+    }
+    blocks {
+      name: "cross2"
+      inputs {
+        name: 'all'
+      }
+      inputs {
+        name: 'cross1'
+      }
+      merge_inputs_into_list: true
+      keras_layer {
+        class_name: 'Cross'
+      }
+    }
+    blocks {
+      name: "cross3"
+      inputs {
+        name: 'all'
+      }
+      inputs {
+        name: 'cross2'
+      }
+      merge_inputs_into_list: true
+      keras_layer {
+        class_name: 'Cross'
+      }
+    }
+    blocks {
+      name: "cross4"
+      inputs {
+        name: 'all'
+      }
+      inputs {
+        name: 'cross3'
+      }
+      merge_inputs_into_list: true
+      keras_layer {
+        class_name: 'Cross'
+      }
+    }
+    blocks {
+      name: "cross5"
+      inputs {
+        name: 'all'
+      }
+      inputs {
+        name: 'cross4'
+      }
+      merge_inputs_into_list: true
+      keras_layer {
+        class_name: 'Cross'
+      }
+    }
+    concat_blocks: ['deep', 'cross5']
+    top_mlp {
+      hidden_units: [64, 32, 16]
+    }
+  }
+  rank_model {
+    l2_regularization: 1e-4
+  }
+  embedding_regularization: 1e-4
+}
+export_config {
+  multi_placeholder: false
+}
diff --git a/examples/readme.md b/examples/readme.md
index d33304faf..55bfb4cba 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -215,6 +215,7 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
   | DeepFM           | 1     | 0.8867 |
   | DeepFM(Backbone) | 1     | 0.8872 |
   | DCN              | 1     | 0.8576 |
+  | DCN (Backbone)   | 1     | 0.8770 |
   | AutoInt          | 1     | 0.8513 |
   | MaskNet          | 1     | 0.8872 |
   | FibiNet          | 1     | 0.8879 |

From 7d0e350eac99280cf5bfcc0ef4f7ae1d6618d616 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 19 Jun 2023 12:59:17 +0800
Subject: [PATCH 36/54] [feat]: add more backbone blocks

---
 easy_rec/python/layers/backbone.py            |  67 +++--
 easy_rec/python/model/easy_rec_model.py       |   7 +-
 easy_rec/python/model/esmm.py                 |   8 +-
 easy_rec/python/model/mmoe.py                 |   5 +-
 easy_rec/python/model/ple.py                  |   5 +-
 easy_rec/python/model/simple_multi_task.py    |   5 +-
 easy_rec/python/protos/backbone.proto         |  31 ++-
 .../configs/dcn_backbone_on_movielens.config  |  64 +----
 examples/configs/mlp_on_movielens.config      | 239 ++++++++++++++++++
 ...wide_and_deep_backbone_on_movielens.config | 216 ++++++++++++++++
 examples/readme.md                            |  24 +-
 11 files changed, 571 insertions(+), 100 deletions(-)
 create mode 100644 examples/configs/mlp_on_movielens.config
 create mode 100644 examples/configs/wide_and_deep_backbone_on_movielens.config

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index d5fac8a49..7eee14a4d 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -2,6 +2,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import logging
 
+import six
 import tensorflow as tf
 from google.protobuf import struct_pb2
 
@@ -9,6 +10,7 @@
 from easy_rec.python.layers.keras import MLP
 from easy_rec.python.layers.utils import Parameter
 from easy_rec.python.protos import backbone_pb2
+from easy_rec.python.protos import keras_layer_pb2
 from easy_rec.python.utils.dag import DAG
 from easy_rec.python.utils.load_class import load_keras_layer
 
@@ -112,6 +114,14 @@ def __call__(self, is_training, **kwargs):
     print('backbone topological order: ' + ','.join(blocks))
     for block in blocks:
       config = self._name_to_blocks[block]
+      if config.layers:  # sequential layers
+        logging.info('call sequential %d layers' % len(config.layers))
+        output = block_input(config, block_outputs)
+        for layer in config.layers:
+          output = self.call_layer(output, layer, block, is_training)
+        block_outputs[block] = output
+        continue
+      # just one of layer
       layer = config.WhichOneof('layer')
       if layer is None:  # identity layer
         block_outputs[block] = block_input(config, block_outputs)
@@ -121,14 +131,11 @@ def __call__(self, is_training, **kwargs):
         output = input_fn(block, is_training)
         block_outputs[block] = output
       elif layer == 'sequential':
-        inputs = block_input(config, block_outputs)
-        layers = config.sequential.layers
-        output = self.call_sequential_layers(inputs, layers, block, is_training)
-        block_outputs[block] = output
+        print(config)
       else:
         inputs = block_input(config, block_outputs)
-        block_outputs[block] = self.call_layer(inputs, config, block,
-                                               is_training)
+        output = self.call_layer(inputs, config, block, is_training)
+        block_outputs[block] = output
 
     temp = []
     for output in self._config.concat_blocks:
@@ -166,16 +173,19 @@ def call_keras_layer(self, layer_conf, inputs, name, training):
         layer = layer_cls(name=name)
       else:
         assert param_type == 'st_params', 'internal keras layer only support st_params'
-        kwargs = convert_to_dict(layer_conf.st_params)
-        layer = layer_cls(name=name, **kwargs)
+        try:
+          kwargs = convert_to_dict(layer_conf.st_params)
+          logging.info('call %s layer with params %r' %
+                       (layer_conf.class_name, kwargs))
+          layer = layer_cls(name=name, **kwargs)
+        except TypeError as e:
+          logging.warning(e)
+          args = map(format_value, layer_conf.st_params.values())
+          logging.info('try to call %s layer with params %r' %
+                       (layer_conf.class_name, args))
+          layer = layer_cls(*args, name=name)
       return layer(inputs, training=training)
 
-  def call_sequential_layers(self, inputs, layers, name, training):
-    output = inputs
-    for layer in layers:
-      output = self.call_layer(output, layer, name, training)
-    return output
-
   def call_layer(self, inputs, config, name, training):
     layer_name = config.WhichOneof('layer')
     if layer_name == 'keras_layer':
@@ -184,6 +194,33 @@ def call_layer(self, inputs, config, name, training):
       conf = getattr(config, 'lambda')
       fn = eval(conf.expression)
       return fn(inputs)
+    if layer_name == 'recurrent':
+      conf = config.recurrent
+      fixed_input_index = -1
+      if conf.HasField('fixed_input_index'):
+        fixed_input_index = conf.fixed_input_index
+      if fixed_input_index >= 0:
+        assert type(inputs) in (tuple, list), '%s inputs must be a list'
+      output = inputs
+      for i in range(conf.num_steps):
+        name_i = '%s_%d' % (name, i)
+        output_i = self.call_keras_layer(conf.keras_layer, output, name_i, training)
+        if fixed_input_index >= 0:
+          j = 0
+          for idx in range(len(output)):
+            if idx == fixed_input_index:
+              continue
+            output[idx] = output_i[j] if type(output_i) in (tuple, list) else output_i
+            j += 1
+        else:
+          output = output_i
+      if fixed_input_index >= 0:
+        del output[fixed_input_index]
+        if len(output) == 1:
+          return output[0]
+        return output
+      return output
+
     raise NotImplementedError('Unsupported backbone layer:' + layer_name)
 
 
@@ -205,7 +242,7 @@ def concat_inputs(inputs, axis=-1, msg=''):
 
 def format_value(value):
   value_type = type(value)
-  if value_type in (unicode, str):
+  if value_type == six.text_type:
     return str(value)
   if value_type == float:
     int_v = int(value)
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index b114d0788..fe9a20ef8 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -68,6 +68,7 @@ def __init__(self,
     #                                          model_config.feature_groups,
     #                                          self._l2_reg)
     # self._sequence_encoding_by_group_name = {}
+    self._backbone_output = None
     if model_config.HasField('backbone'):
       self._backbone = Backbone(
           model_config.backbone,
@@ -83,11 +84,13 @@ def has_backbone(self):
 
   @property
   def backbone(self):
+    if self._backbone_output:
+      return self._backbone_output
     if self._backbone:
-      output = self._backbone(self._is_training)
+      self._backbone_output = self._backbone(self._is_training)
       loss_dict = self._backbone.loss_dict
       self._loss_dict.update(loss_dict)
-      return output
+      return self._backbone_output
     return None
 
   @property
diff --git a/easy_rec/python/model/esmm.py b/easy_rec/python/model/esmm.py
index c6eaad483..50567ae63 100644
--- a/easy_rec/python/model/esmm.py
+++ b/easy_rec/python/model/esmm.py
@@ -31,7 +31,9 @@ def __init__(self,
 
     self._group_num = len(self._model_config.groups)
     self._group_features = []
-    if self._group_num > 0:
+    if self.has_backbone:
+      logging.info('use bottom backbone network')
+    elif self._group_num > 0:
       logging.info('group_num: {0}'.format(self._group_num))
       for group_id in range(self._group_num):
         group = self._model_config.groups[group_id]
@@ -173,7 +175,9 @@ def build_predict_graph(self):
     Returns:
       self._prediction_dict: Prediction result of two tasks.
     """
-    if self._group_num > 0:
+    if self.has_backbone:
+      all_fea = self.backbone
+    elif self._group_num > 0:
       group_fea_arr = []
       # Both towers share the underlying network.
       for group_id in range(self._group_num):
diff --git a/easy_rec/python/model/mmoe.py b/easy_rec/python/model/mmoe.py
index acf1d6d59..3cc644f6d 100644
--- a/easy_rec/python/model/mmoe.py
+++ b/easy_rec/python/model/mmoe.py
@@ -26,7 +26,10 @@ def __init__(self,
     self._model_config = self._model_config.mmoe
     assert isinstance(self._model_config, MMoEConfig)
 
-    self._features, _ = self._input_layer(self._feature_dict, 'all')
+    if self.has_backbone:
+      self._features = self.backbone
+    else:
+      self._features, _ = self._input_layer(self._feature_dict, 'all')
     self._init_towers(self._model_config.task_towers)
 
   def build_predict_graph(self):
diff --git a/easy_rec/python/model/ple.py b/easy_rec/python/model/ple.py
index f3ad71215..e04781bcd 100644
--- a/easy_rec/python/model/ple.py
+++ b/easy_rec/python/model/ple.py
@@ -27,7 +27,10 @@ def __init__(self,
 
     self._layer_nums = len(self._model_config.extraction_networks)
     self._task_nums = len(self._model_config.task_towers)
-    self._features, _ = self._input_layer(self._feature_dict, 'all')
+    if self.has_backbone:
+      self._features = self.backbone
+    else:
+      self._features, _ = self._input_layer(self._feature_dict, 'all')
     self._init_towers(self._model_config.task_towers)
 
   def gate(self, selector_fea, vec_feas, name):
diff --git a/easy_rec/python/model/simple_multi_task.py b/easy_rec/python/model/simple_multi_task.py
index b4c0613bc..05dd7a773 100644
--- a/easy_rec/python/model/simple_multi_task.py
+++ b/easy_rec/python/model/simple_multi_task.py
@@ -27,7 +27,10 @@ def __init__(self,
     self._model_config = self._model_config.simple_multi_task
     assert isinstance(self._model_config, SimpleMultiTaskConfig)
 
-    self._features, _ = self._input_layer(self._feature_dict, 'all')
+    if self.has_backbone:
+      self._features = self.backbone
+    else:
+      self._features, _ = self._input_layer(self._feature_dict, 'all')
     self._init_towers(self._model_config.task_towers)
 
   def build_predict_graph(self):
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
index a11944d95..6f292a48d 100644
--- a/easy_rec/python/protos/backbone.proto
+++ b/easy_rec/python/protos/backbone.proto
@@ -24,6 +24,20 @@ message Input {
     optional string input_fn = 2;
 }
 
+message RecurrentLayer {
+    required uint32 num_steps = 1 [default = 1];
+    optional uint32 fixed_input_index = 2;
+    required KerasLayer keras_layer = 3;
+}
+
+message Layer {
+    oneof layer {
+        Lambda lambda = 1;
+        KerasLayer keras_layer = 2;
+        RecurrentLayer recurrent = 3;
+    }
+}
+
 message Block {
     required string name = 1;
     // the input names of feature groups or other blocks
@@ -31,11 +45,15 @@ message Block {
     optional int32 input_concat_axis = 3 [default = -1];
     optional bool merge_inputs_into_list = 4;
     optional string extra_input_fn = 5;
+
+    // sequential layers
+    repeated Layer layers = 6;
+    // only take effect when there are no layers
     oneof layer {
         InputLayer input_layer = 101;
         Lambda lambda = 102;
         KerasLayer keras_layer = 103;
-        Sequential sequential = 104;
+        RecurrentLayer recurrent = 104;
     }
 }
 
@@ -44,14 +62,3 @@ message BackboneTower {
     repeated string concat_blocks = 2;
     optional MLP top_mlp = 3;
 }
-
-message Layer {
-    oneof layer {
-        Lambda lambda = 101;
-        KerasLayer keras_layer = 102;
-    }
-}
-
-message Sequential {
-    repeated Layer layers = 1;
-}
diff --git a/examples/configs/dcn_backbone_on_movielens.config b/examples/configs/dcn_backbone_on_movielens.config
index f16337fdd..9c84794dd 100644
--- a/examples/configs/dcn_backbone_on_movielens.config
+++ b/examples/configs/dcn_backbone_on_movielens.config
@@ -174,68 +174,20 @@ model_config: {
       }
     }
     blocks {
-      name: "cross1"
+      name: "dcn"
       inputs {
         name: 'all'
         input_fn: 'lambda x: [x, x]'
       }
-      keras_layer {
-        class_name: 'Cross'
-      }
-    }
-    blocks {
-      name: "cross2"
-      inputs {
-        name: 'all'
-      }
-      inputs {
-        name: 'cross1'
-      }
-      merge_inputs_into_list: true
-      keras_layer {
-        class_name: 'Cross'
-      }
-    }
-    blocks {
-      name: "cross3"
-      inputs {
-        name: 'all'
-      }
-      inputs {
-        name: 'cross2'
-      }
-      merge_inputs_into_list: true
-      keras_layer {
-        class_name: 'Cross'
-      }
-    }
-    blocks {
-      name: "cross4"
-      inputs {
-        name: 'all'
-      }
-      inputs {
-        name: 'cross3'
-      }
-      merge_inputs_into_list: true
-      keras_layer {
-        class_name: 'Cross'
-      }
-    }
-    blocks {
-      name: "cross5"
-      inputs {
-        name: 'all'
-      }
-      inputs {
-        name: 'cross4'
-      }
-      merge_inputs_into_list: true
-      keras_layer {
-        class_name: 'Cross'
+      recurrent {
+        num_steps: 3
+        fixed_input_index: 0
+        keras_layer {
+          class_name: 'Cross'
+        }
       }
     }
-    concat_blocks: ['deep', 'cross5']
+    concat_blocks: ['deep', 'dcn']
     top_mlp {
       hidden_units: [64, 32, 16]
     }
diff --git a/examples/configs/mlp_on_movielens.config b/examples/configs/mlp_on_movielens.config
new file mode 100644
index 000000000..392f392ef
--- /dev/null
+++ b/examples/configs/mlp_on_movielens.config
@@ -0,0 +1,239 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/mlp_movieslen"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_class: "RankModel"
+  feature_groups: {
+    group_name: 'features'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: 'mlp'
+      inputs {
+        name: 'features'
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dense'
+          st_params {
+            fields {
+              key: 'units'
+              value: { number_value: 256 }
+            }
+            fields {
+              key: 'activation'
+              value: { string_value: 'relu' }
+            }
+          }
+        }
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dropout'
+          st_params {
+            fields {
+              key: 'rate'
+              value: { number_value: 0.5 }
+            }
+          }
+        }
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dense'
+          st_params {
+            fields {
+              key: 'units'
+              value: { number_value: 256 }
+            }
+            fields {
+              key: 'activation'
+              value: { string_value: 'relu' }
+            }
+          }
+        }
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dropout'
+          st_params {
+            fields {
+              key: 'rate'
+              value: { number_value: 0.5 }
+            }
+          }
+        }
+      }
+      layers {
+        keras_layer {
+          class_name: 'Dense'
+          st_params {
+            fields {
+              key: 'units'
+              value: { number_value: 1 }
+            }
+          }
+        }
+      }
+    }
+    concat_blocks: 'mlp'
+  }
+  rank_model {
+    l2_regularization: 1e-4
+  }
+  embedding_regularization: 1e-4
+}
diff --git a/examples/configs/wide_and_deep_backbone_on_movielens.config b/examples/configs/wide_and_deep_backbone_on_movielens.config
new file mode 100644
index 000000000..dddc91888
--- /dev/null
+++ b/examples/configs/wide_and_deep_backbone_on_movielens.config
@@ -0,0 +1,216 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/wide_and_deep_movieslen"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_class: "RankModel"
+  feature_groups: {
+    group_name: 'wide'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: WIDE
+  }
+  feature_groups: {
+    group_name: 'deep'
+    feature_names: 'user_id'
+    feature_names: 'movie_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    blocks {
+      name: 'wide'
+      input_layer {
+        only_output_feature_list: true
+      }
+    }
+    blocks {
+      name: 'deep_logit'
+      inputs {
+        name: 'deep'
+      }
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 256, 256, 1]
+          use_final_bn: false
+          final_activation: 'linear'
+        }
+      }
+    }
+    blocks {
+      name: 'final_logit'
+      inputs {
+        name: 'wide'
+        input_fn: 'lambda x: tf.add_n(x)'
+      }
+      inputs {
+        name: 'deep_logit'
+      }
+      merge_inputs_into_list: true
+      keras_layer {
+        class_name: 'Add'
+      }
+    }
+    concat_blocks: 'final_logit'
+  }
+  rank_model {
+    wide_output_dim: 1
+    l2_regularization: 1e-4
+  }
+  embedding_regularization: 1e-4
+}
diff --git a/examples/readme.md b/examples/readme.md
index 55bfb4cba..ba4f57cce 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -209,16 +209,20 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
 
 - MovieLens-1M
 
-  | Model            | Epoch | AUC    |
-  | ---------------- | ----- | ------ |
-  | Wide&Deep        | 1     | 0.8558 |
-  | DeepFM           | 1     | 0.8867 |
-  | DeepFM(Backbone) | 1     | 0.8872 |
-  | DCN              | 1     | 0.8576 |
-  | DCN (Backbone)   | 1     | 0.8770 |
-  | AutoInt          | 1     | 0.8513 |
-  | MaskNet          | 1     | 0.8872 |
-  | FibiNet          | 1     | 0.8879 |
+  | Model               | Epoch | AUC    |
+  | ------------------- | ----- | ------ |
+  | MLP                 | 1     | 0.8616 |
+  | Wide&Deep           | 1     | 0.8558 |
+  | Wide&Deep(Backbone) | 1     | 0.8854 |
+  | DeepFM              | 1     | 0.8867 |
+  | DeepFM(Backbone)    | 1     | 0.8872 |
+  | DCN                 | 1     | 0.8576 |
+  | DCN (Backbone)      | 1     | 0.8770 |
+  | AutoInt             | 1     | 0.8513 |
+  | MaskNet             | 1     | 0.8872 |
+  | FibiNet             | 1     | 0.8879 |
+
+  备注：`MovieLens-1M` 数据集较小，评估指标方差较大，以上结果仅供参考。
 
 - Criteo-Research
 

From 136cf37ce92d54fb5255ded0f59d3771c8ee5673 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 19 Jun 2023 15:14:14 +0800
Subject: [PATCH 37/54] [feat]: format backbone code, add recurrent and
 sequential layer

---
 easy_rec/python/layers/backbone.py            |   9 +-
 easy_rec/python/layers/keras/__init__.py      |   6 +-
 .../python/layers/keras/dot_interaction.py    |  89 -------------
 easy_rec/python/layers/keras/fibinet.py       |  66 +++++----
 easy_rec/python/layers/keras/fm.py            |  46 -------
 .../layers/keras/{dcn.py => interaction.py}   | 125 +++++++++++++++++-
 easy_rec/python/layers/keras/mask_net.py      |   3 -
 easy_rec/python/model/cmbf.py                 |   2 +-
 .../model/collaborative_metric_learning.py    |  86 ++++++------
 easy_rec/python/model/dcn.py                  |   2 +-
 easy_rec/python/model/deepfm.py               |   6 +-
 easy_rec/python/model/easy_rec_model.py       |   1 -
 easy_rec/python/model/multi_tower.py          |   2 +-
 easy_rec/python/model/multi_tower_bst.py      |   2 +-
 easy_rec/python/model/multi_tower_din.py      |   2 +-
 easy_rec/python/model/multi_tower_recall.py   |   2 +-
 easy_rec/python/model/uniter.py               |   2 +-
 easy_rec/python/model/wide_and_deep.py        |   8 +-
 18 files changed, 232 insertions(+), 227 deletions(-)
 delete mode 100644 easy_rec/python/layers/keras/dot_interaction.py
 delete mode 100644 easy_rec/python/layers/keras/fm.py
 rename easy_rec/python/layers/keras/{dcn.py => interaction.py} (59%)

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 7eee14a4d..22645bee0 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -10,7 +10,6 @@
 from easy_rec.python.layers.keras import MLP
 from easy_rec.python.layers.utils import Parameter
 from easy_rec.python.protos import backbone_pb2
-from easy_rec.python.protos import keras_layer_pb2
 from easy_rec.python.utils.dag import DAG
 from easy_rec.python.utils.load_class import load_keras_layer
 
@@ -204,13 +203,17 @@ def call_layer(self, inputs, config, name, training):
       output = inputs
       for i in range(conf.num_steps):
         name_i = '%s_%d' % (name, i)
-        output_i = self.call_keras_layer(conf.keras_layer, output, name_i, training)
+        layer = conf.keras_layer
+        output_i = self.call_keras_layer(layer, output, name_i, training)
         if fixed_input_index >= 0:
           j = 0
           for idx in range(len(output)):
             if idx == fixed_input_index:
               continue
-            output[idx] = output_i[j] if type(output_i) in (tuple, list) else output_i
+            if type(output_i) in (tuple, list):
+              output[idx] = output_i[j]
+            else:
+              output[idx] = output_i
             j += 1
         else:
           output = output_i
diff --git a/easy_rec/python/layers/keras/__init__.py b/easy_rec/python/layers/keras/__init__.py
index 64cacf3c9..24f62ffb3 100644
--- a/easy_rec/python/layers/keras/__init__.py
+++ b/easy_rec/python/layers/keras/__init__.py
@@ -1,13 +1,13 @@
 from .blocks import MLP
 from .blocks import Highway
 from .bst import BST
-from .dcn import Cross
 from .din import DIN
-from .dot_interaction import DotInteraction
 from .fibinet import BiLinear
 from .fibinet import FiBiNet
 from .fibinet import SENet
-from .fm import FM
+from .interaction import FM
+from .interaction import Cross
+from .interaction import DotInteraction
 from .mask_net import MaskBlock
 from .mask_net import MaskNet
 from .numerical_embedding import AutoDisEmbedding
diff --git a/easy_rec/python/layers/keras/dot_interaction.py b/easy_rec/python/layers/keras/dot_interaction.py
deleted file mode 100644
index 7ec47c5ad..000000000
--- a/easy_rec/python/layers/keras/dot_interaction.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# -*- encoding:utf-8 -*-
-# Copyright (c) Alibaba, Inc. and its affiliates.
-"""Implements `Dot Interaction` Layer of DLRM model."""
-
-import tensorflow as tf
-
-
-class DotInteraction(tf.keras.layers.Layer):
-  """Dot interaction layer.
-
-  See theory in the DLRM paper: https://arxiv.org/pdf/1906.00091.pdf,
-  section 2.1.3. Sparse activations and dense activations are combined.
-  Dot interaction is applied to a batch of input Tensors [e1,...,e_k] of the
-  same dimension and the output is a batch of Tensors with all distinct pairwise
-  dot products of the form dot(e_i, e_j) for i <= j if self self_interaction is
-  True, otherwise dot(e_i, e_j) i < j.
-
-  Attributes:
-    self_interaction: Boolean indicating if features should self-interact.
-      If it is True, then the diagonal entries of the interaction metric are
-      also taken.
-    skip_gather: An optimization flag. If it's set then the upper triangle part
-      of the dot interaction matrix dot(e_i, e_j) is set to 0. The resulting
-      activations will be of dimension [num_features * num_features] from which
-      half will be zeros. Otherwise activations will be only lower triangle part
-      of the interaction matrix. The later saves space but is much slower.
-    name: String name of the layer.
-  """
-
-  def __init__(self, params, name=None, **kwargs):
-    self._self_interaction = params.get_or_default('self_interaction', False)
-    self._skip_gather = params.get_or_default('skip_gather', False)
-    super(DotInteraction, self).__init__(name=name, **kwargs)
-
-  def call(self, inputs, **kwargs):
-    """Performs the interaction operation on the tensors in the list.
-
-    The tensors represent as transformed dense features and embedded categorical
-    features.
-    Pre-condition: The tensors should all have the same shape.
-
-    Args:
-      inputs: List of features with shapes [batch_size, feature_dim].
-
-    Returns:
-      activations: Tensor representing interacted features. It has a dimension
-      `num_features * num_features` if skip_gather is True, otherside
-      `num_features * (num_features + 1) / 2` if self_interaction is True and
-      `num_features * (num_features - 1) / 2` if self_interaction is False.
-    """
-    if isinstance(inputs, (list, tuple)):
-      # concat_features shape: batch_size, num_features, feature_dim
-      try:
-        concat_features = tf.stack(inputs, axis=1)
-      except (ValueError, tf.errors.InvalidArgumentError) as e:
-        raise ValueError('Input tensors` dimensions must be equal, original'
-                         'error message: {}'.format(e))
-    else:
-      assert inputs.shape.ndims == 3, 'input of dot func must be a 3D tensor or a list of 2D tensors'
-      concat_features = inputs
-
-    batch_size = tf.shape(concat_features)[0]
-
-    # Interact features, select lower-triangular portion, and re-shape.
-    xactions = tf.matmul(concat_features, concat_features, transpose_b=True)
-    num_features = xactions.shape[-1]
-    ones = tf.ones_like(xactions)
-    if self._self_interaction:
-      # Selecting lower-triangular portion including the diagonal.
-      lower_tri_mask = tf.linalg.band_part(ones, -1, 0)
-      upper_tri_mask = ones - lower_tri_mask
-      out_dim = num_features * (num_features + 1) // 2
-    else:
-      # Selecting lower-triangular portion not included the diagonal.
-      upper_tri_mask = tf.linalg.band_part(ones, 0, -1)
-      lower_tri_mask = ones - upper_tri_mask
-      out_dim = num_features * (num_features - 1) // 2
-
-    if self._skip_gather:
-      # Setting upper triangle part of the interaction matrix to zeros.
-      activations = tf.where(
-          condition=tf.cast(upper_tri_mask, tf.bool),
-          x=tf.zeros_like(xactions),
-          y=xactions)
-      out_dim = num_features * num_features
-    else:
-      activations = tf.boolean_mask(xactions, lower_tri_mask)
-    activations = tf.reshape(activations, (batch_size, out_dim))
-    return activations
diff --git a/easy_rec/python/layers/keras/fibinet.py b/easy_rec/python/layers/keras/fibinet.py
index dc1f7d003..98cdb3179 100644
--- a/easy_rec/python/layers/keras/fibinet.py
+++ b/easy_rec/python/layers/keras/fibinet.py
@@ -5,7 +5,6 @@
 
 import tensorflow as tf
 
-from easy_rec.python.layers import dnn
 from easy_rec.python.layers.common_layers import layer_norm
 from easy_rec.python.layers.keras.blocks import MLP
 from easy_rec.python.layers.utils import Parameter
@@ -15,9 +14,20 @@
 
 
 class SENet(tf.keras.layers.Layer):
-  """SENet+ Layer used in FiBiNET，支持不同field的embedding dimension不等.
+  """SENET Layer used in FiBiNET.
 
-  arxiv: 2209.05016
+  Input shape
+    - A list of 2D tensor with shape: ``(batch_size,embedding_size)``.
+      The ``embedding_size`` of each field can have different value.
+
+  Output shape
+    - A 2D tensor with shape: ``(batch_size,sum_of_embedding_size)``.
+
+  References:
+    1. [FiBiNET](https://arxiv.org/pdf/1905.09433.pdf)
+      Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction
+    2. [FiBiNet++](https://arxiv.org/pdf/2209.05016.pdf)
+      Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction
   """
 
   def __init__(self, params, name='SENet', **kwargs):
@@ -25,8 +35,6 @@ def __init__(self, params, name='SENet', **kwargs):
     self.config = params.get_pb_config()
 
   def call(self, inputs, **kwargs):
-    """embedding_list:  - A list of 2D tensor with shape: ``(batch_size,embedding_size)``."""
-    print('SENET layer with %d inputs' % len(inputs))
     g = self.config.num_squeeze_group
     for emb in inputs:
       assert emb.shape.ndims == 2, 'field embeddings must be rank 2 tensors'
@@ -88,14 +96,26 @@ def _full_interaction(v_i, v_j):
 
 
 class BiLinear(tf.keras.layers.Layer):
-  """双线性特征交互层，支持不同field embeddings的size不等.
+  """BilinearInteraction Layer used in FiBiNET.
+
+  Input shape
+    - A list of 2D tensor with shape: ``(batch_size,embedding_size)``.
+      Its length is ``filed_size``.
+      The ``embedding_size`` of each field can have different value.
 
-  arxiv: 2209.05016
+  Output shape
+    - 2D tensor with shape: ``(batch_size,output_size)``.
 
   Attributes:
-    num_output_units: 输出的size
-    type: ['all', 'each', 'interaction']，支持其中一种
-    use_plus: 是否使用bi-linear+
+    num_output_units: the number of output units
+    type: ['all', 'each', 'interaction'], types of bilinear functions used in this layer
+    use_plus: whether to use bi-linear+
+
+  References:
+    1. [FiBiNET](https://arxiv.org/pdf/1905.09433.pdf)
+      Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction
+    2. [FiBiNet++](https://arxiv.org/pdf/2209.05016.pdf)
+      Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction
   """
 
   def __init__(self, params, name='bilinear', **kwargs):
@@ -186,36 +206,32 @@ def call(self, inputs, **kwargs):
 class FiBiNet(tf.keras.layers.Layer):
   """FiBiNet++:Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction.
 
-  This is almost an exact implementation of the original FiBiNet++ model.
-  See the original paper:
-  https://arxiv.org/pdf/2209.05016.pdf
+  References:
+    - [FiBiNet++](https://arxiv.org/pdf/2209.05016.pdf)
+      Improving FiBiNet by Greatly Reducing Model Size for CTR Prediction
   """
 
-  def __init__(self, params, name='fibinet', l2_reg=None, **kwargs):
+  def __init__(self, params, name='fibinet', **kwargs):
     super(FiBiNet, self).__init__(name, **kwargs)
     self._config = params.get_pb_config()
     if self._config.HasField('mlp'):
-      # self.final_dnn = dnn.DNN(
-      #   self._config.mlp,
-      #   kwargs['l2_reg'] if 'l2_reg' in kwargs else None,
-      #   name='%s_fibinet_mlp' % self.name,
-      #   is_training=False)
       p = Parameter.make_from_pb(self._config.mlp)
-      self.final_dnn = MLP(p, name=name, l2_reg=l2_reg)
+      p.l2_regularizer = params.l2_regularizer
+      self.final_mlp = MLP(p, name=name)
     else:
-      self.final_dnn = None
+      self.final_mlp = None
 
   def call(self, inputs, training=None, **kwargs):
     feature_list = []
 
     params = Parameter.make_from_pb(self._config.senet)
-    senet = SENet(params, name='%s_senet' % self.name)
+    senet = SENet(params, name='%s/senet' % self.name)
     senet_output = senet(inputs)
     feature_list.append(senet_output)
 
     if self._config.HasField('bilinear'):
       params = Parameter.make_from_pb(self._config.bilinear)
-      bilinear = BiLinear(params, name='%s_bilinear' % self.name)
+      bilinear = BiLinear(params, name='%s/bilinear' % self.name)
       bilinear_output = bilinear(inputs)
       feature_list.append(bilinear_output)
 
@@ -224,6 +240,6 @@ def call(self, inputs, training=None, **kwargs):
     else:
       feature = feature_list[0]
 
-    if self.final_dnn is not None:
-      feature = self.final_dnn(feature, training=training)
+    if self.final_mlp is not None:
+      feature = self.final_mlp(feature, training=training)
     return feature
diff --git a/easy_rec/python/layers/keras/fm.py b/easy_rec/python/layers/keras/fm.py
deleted file mode 100644
index 56910541f..000000000
--- a/easy_rec/python/layers/keras/fm.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# -*- encoding:utf-8 -*-
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import tensorflow as tf
-
-if tf.__version__ >= '2.0':
-  tf = tf.compat.v1
-
-
-class FM(tf.keras.layers.Layer):
-  """Factorization Machine models pairwise (order-2) feature interactions without linear term and bias.
-
-  References
-    - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
-  Input shape.
-    - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
-    - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
-  Output shape
-    - 2D tensor with shape: ``(batch_size, 1)``.
-  """
-
-  def __init__(self, params, name='fm', **kwargs):
-    super(FM, self).__init__(name, **kwargs)
-    self.use_variant = params.get_or_default('use_variant', False)
-
-  def call(self, inputs, **kwargs):
-    if type(inputs) == list:
-      emb_dims = set(map(lambda x: int(x.shape[-1]), inputs))
-      if len(emb_dims) != 1:
-        dims = ','.join([str(d) for d in emb_dims])
-        raise ValueError('all embedding dim must be equal in FM layer:' + dims)
-
-      with tf.name_scope(self.name):
-        fea = tf.stack(inputs, axis=1)
-    else:
-      assert inputs.shape.ndims == 3, 'input of FM layer must be a 3D tensor or a list of 2D tensors'
-      fea = inputs
-
-    with tf.name_scope(self.name):
-      square_of_sum = tf.square(tf.reduce_sum(fea, axis=1))
-      sum_of_square = tf.reduce_sum(tf.square(fea), axis=1)
-      cross_term = tf.subtract(square_of_sum, sum_of_square)
-      if self.use_variant:
-        cross_term = 0.5 * cross_term
-      else:
-        cross_term = 0.5 * tf.reduce_sum(cross_term, axis=-1, keepdims=True)
-    return cross_term
diff --git a/easy_rec/python/layers/keras/dcn.py b/easy_rec/python/layers/keras/interaction.py
similarity index 59%
rename from easy_rec/python/layers/keras/dcn.py
rename to easy_rec/python/layers/keras/interaction.py
index 9585893e5..55f56f7a1 100644
--- a/easy_rec/python/layers/keras/dcn.py
+++ b/easy_rec/python/layers/keras/interaction.py
@@ -1,12 +1,133 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
-"""Implements `Cross` Layer, the cross layer in Deep & Cross Network (DCN)."""
-
 import tensorflow as tf
 
 from easy_rec.python.utils.activation import get_activation
 
 
+class FM(tf.keras.layers.Layer):
+  """Factorization Machine models pairwise (order-2) feature interactions without linear term and bias.
+
+  References
+    - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
+  Input shape.
+    - List of 2D tensor with shape: ``(batch_size,embedding_size)``.
+    - Or a 3D tensor with shape: ``(batch_size,field_size,embedding_size)``
+  Output shape
+    - 2D tensor with shape: ``(batch_size, 1)``.
+  """
+
+  def __init__(self, params, name='fm', **kwargs):
+    super(FM, self).__init__(name, **kwargs)
+    self.use_variant = params.get_or_default('use_variant', False)
+
+  def call(self, inputs, **kwargs):
+    if type(inputs) == list:
+      emb_dims = set(map(lambda x: int(x.shape[-1]), inputs))
+      if len(emb_dims) != 1:
+        dims = ','.join([str(d) for d in emb_dims])
+        raise ValueError('all embedding dim must be equal in FM layer:' + dims)
+      with tf.name_scope(self.name):
+        fea = tf.stack(inputs, axis=1)
+    else:
+      assert inputs.shape.ndims == 3, 'input of FM layer must be a 3D tensor or a list of 2D tensors'
+      fea = inputs
+
+    with tf.name_scope(self.name):
+      square_of_sum = tf.square(tf.reduce_sum(fea, axis=1))
+      sum_of_square = tf.reduce_sum(tf.square(fea), axis=1)
+      cross_term = tf.subtract(square_of_sum, sum_of_square)
+      if self.use_variant:
+        cross_term = 0.5 * cross_term
+      else:
+        cross_term = 0.5 * tf.reduce_sum(cross_term, axis=-1, keepdims=True)
+    return cross_term
+
+
+class DotInteraction(tf.keras.layers.Layer):
+  """Dot interaction layer of DLRM model..
+
+  See theory in the DLRM paper: https://arxiv.org/pdf/1906.00091.pdf,
+  section 2.1.3. Sparse activations and dense activations are combined.
+  Dot interaction is applied to a batch of input Tensors [e1,...,e_k] of the
+  same dimension and the output is a batch of Tensors with all distinct pairwise
+  dot products of the form dot(e_i, e_j) for i <= j if self self_interaction is
+  True, otherwise dot(e_i, e_j) i < j.
+
+  Attributes:
+    self_interaction: Boolean indicating if features should self-interact.
+      If it is True, then the diagonal entries of the interaction metric are
+      also taken.
+    skip_gather: An optimization flag. If it's set then the upper triangle part
+      of the dot interaction matrix dot(e_i, e_j) is set to 0. The resulting
+      activations will be of dimension [num_features * num_features] from which
+      half will be zeros. Otherwise activations will be only lower triangle part
+      of the interaction matrix. The later saves space but is much slower.
+    name: String name of the layer.
+  """
+
+  def __init__(self, params, name=None, **kwargs):
+    self._self_interaction = params.get_or_default('self_interaction', False)
+    self._skip_gather = params.get_or_default('skip_gather', False)
+    super(DotInteraction, self).__init__(name=name, **kwargs)
+
+  def call(self, inputs, **kwargs):
+    """Performs the interaction operation on the tensors in the list.
+
+    The tensors represent as transformed dense features and embedded categorical
+    features.
+    Pre-condition: The tensors should all have the same shape.
+
+    Args:
+      inputs: List of features with shapes [batch_size, feature_dim].
+
+    Returns:
+      activations: Tensor representing interacted features. It has a dimension
+      `num_features * num_features` if skip_gather is True, otherside
+      `num_features * (num_features + 1) / 2` if self_interaction is True and
+      `num_features * (num_features - 1) / 2` if self_interaction is False.
+    """
+    if isinstance(inputs, (list, tuple)):
+      # concat_features shape: batch_size, num_features, feature_dim
+      try:
+        concat_features = tf.stack(inputs, axis=1)
+      except (ValueError, tf.errors.InvalidArgumentError) as e:
+        raise ValueError('Input tensors` dimensions must be equal, original'
+                         'error message: {}'.format(e))
+    else:
+      assert inputs.shape.ndims == 3, 'input of dot func must be a 3D tensor or a list of 2D tensors'
+      concat_features = inputs
+
+    batch_size = tf.shape(concat_features)[0]
+
+    # Interact features, select lower-triangular portion, and re-shape.
+    xactions = tf.matmul(concat_features, concat_features, transpose_b=True)
+    num_features = xactions.shape[-1]
+    ones = tf.ones_like(xactions)
+    if self._self_interaction:
+      # Selecting lower-triangular portion including the diagonal.
+      lower_tri_mask = tf.linalg.band_part(ones, -1, 0)
+      upper_tri_mask = ones - lower_tri_mask
+      out_dim = num_features * (num_features + 1) // 2
+    else:
+      # Selecting lower-triangular portion not included the diagonal.
+      upper_tri_mask = tf.linalg.band_part(ones, 0, -1)
+      lower_tri_mask = ones - upper_tri_mask
+      out_dim = num_features * (num_features - 1) // 2
+
+    if self._skip_gather:
+      # Setting upper triangle part of the interaction matrix to zeros.
+      activations = tf.where(
+          condition=tf.cast(upper_tri_mask, tf.bool),
+          x=tf.zeros_like(xactions),
+          y=xactions)
+      out_dim = num_features * num_features
+    else:
+      activations = tf.boolean_mask(xactions, lower_tri_mask)
+    activations = tf.reshape(activations, (batch_size, out_dim))
+    return activations
+
+
 class Cross(tf.keras.layers.Layer):
   """Cross Layer in Deep & Cross Network to learn explicit feature interactions.
 
diff --git a/easy_rec/python/layers/keras/mask_net.py b/easy_rec/python/layers/keras/mask_net.py
index 8749a1ee8..2e66beb22 100644
--- a/easy_rec/python/layers/keras/mask_net.py
+++ b/easy_rec/python/layers/keras/mask_net.py
@@ -6,9 +6,6 @@
 from easy_rec.python.layers.keras.blocks import MLP
 from easy_rec.python.layers.utils import Parameter
 
-if tf.__version__ >= '2.0':
-  tf = tf.compat.v1
-
 
 class MaskBlock(tf.keras.layers.Layer):
 
diff --git a/easy_rec/python/model/cmbf.py b/easy_rec/python/model/cmbf.py
index 0f0a8f3aa..a11a30582 100644
--- a/easy_rec/python/model/cmbf.py
+++ b/easy_rec/python/model/cmbf.py
@@ -38,7 +38,7 @@ def __init__(self,
 
   def build_predict_graph(self):
     hidden = self._cmbf_layer(self._is_training, l2_reg=self._l2_reg)
-    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(hidden)
 
diff --git a/easy_rec/python/model/collaborative_metric_learning.py b/easy_rec/python/model/collaborative_metric_learning.py
index d785e7141..b19537239 100644
--- a/easy_rec/python/model/collaborative_metric_learning.py
+++ b/easy_rec/python/model/collaborative_metric_learning.py
@@ -48,21 +48,22 @@ def __init__(
       raise ValueError('unsupported loss type: %s' %
                        LossType.Name(self._loss_type))
 
-    self._highway_features = {}
-    self._highway_num = len(self._model_config.highway)
-    for _id in range(self._highway_num):
-      highway_cfg = self._model_config.highway[_id]
-      highway_feature, _ = self._input_layer(self._feature_dict,
-                                             highway_cfg.input)
-      self._highway_features[highway_cfg.input] = highway_feature
-
-    self.input_features = []
-    if self._model_config.HasField('input'):
-      input_feature, _ = self._input_layer(self._feature_dict,
-                                           self._model_config.input)
-      self.input_features.append(input_feature)
-
-    self.dnn = copy_obj(self._model_config.dnn)
+    if not self.has_backbone:
+      self._highway_features = {}
+      self._highway_num = len(self._model_config.highway)
+      for _id in range(self._highway_num):
+        highway_cfg = self._model_config.highway[_id]
+        highway_feature, _ = self._input_layer(self._feature_dict,
+                                               highway_cfg.input)
+        self._highway_features[highway_cfg.input] = highway_feature
+
+      self.input_features = []
+      if self._model_config.HasField('input'):
+        input_feature, _ = self._input_layer(self._feature_dict,
+                                             self._model_config.input)
+        self.input_features.append(input_feature)
+
+      self.dnn = copy_obj(self._model_config.dnn)
 
     if self._labels is not None:
       if self._model_config.HasField('session_id'):
@@ -79,32 +80,35 @@ def __init__(
       self.sample_id = None
 
   def build_predict_graph(self):
-    for _id in range(self._highway_num):
-      highway_cfg = self._model_config.highway[_id]
-      highway_fea = tf.layers.batch_normalization(
-          self._highway_features[highway_cfg.input],
-          training=self._is_training,
-          trainable=True,
-          name='highway_%s_bn' % highway_cfg.input)
-      highway_fea = highway(
-          highway_fea,
-          highway_cfg.emb_size,
-          activation=gelu,
-          scope='highway_%s' % _id)
-      print('highway_fea: ', highway_fea)
-      self.input_features.append(highway_fea)
-
-    feature = tf.concat(self.input_features, axis=1)
-
-    num_dnn_layer = len(self.dnn.hidden_units)
-    last_hidden = self.dnn.hidden_units.pop()
-    dnn_net = dnn.DNN(self.dnn, self._l2_reg, 'dnn', self._is_training)
-    net_output = dnn_net(feature)
-    tower_emb = tf.layers.dense(
-        inputs=net_output,
-        units=last_hidden,
-        kernel_regularizer=self._l2_reg,
-        name='dnn/dnn_%d' % (num_dnn_layer - 1))
+    if self.has_backbone:
+      tower_emb = self.backbone
+    else:
+      for _id in range(self._highway_num):
+        highway_cfg = self._model_config.highway[_id]
+        highway_fea = tf.layers.batch_normalization(
+            self._highway_features[highway_cfg.input],
+            training=self._is_training,
+            trainable=True,
+            name='highway_%s_bn' % highway_cfg.input)
+        highway_fea = highway(
+            highway_fea,
+            highway_cfg.emb_size,
+            activation=gelu,
+            scope='highway_%s' % _id)
+        print('highway_fea: ', highway_fea)
+        self.input_features.append(highway_fea)
+
+      feature = tf.concat(self.input_features, axis=1)
+
+      num_dnn_layer = len(self.dnn.hidden_units)
+      last_hidden = self.dnn.hidden_units.pop()
+      dnn_net = dnn.DNN(self.dnn, self._l2_reg, 'dnn', self._is_training)
+      net_output = dnn_net(feature)
+      tower_emb = tf.layers.dense(
+          inputs=net_output,
+          units=last_hidden,
+          kernel_regularizer=self._l2_reg,
+          name='dnn/dnn_%d' % (num_dnn_layer - 1))
 
     if self._model_config.output_l2_normalized_emb:
       norm_emb = tf.nn.l2_normalize(tower_emb, axis=-1)
diff --git a/easy_rec/python/model/dcn.py b/easy_rec/python/model/dcn.py
index fcfa7e780..2a460163a 100644
--- a/easy_rec/python/model/dcn.py
+++ b/easy_rec/python/model/dcn.py
@@ -60,7 +60,7 @@ def build_predict_graph(self):
     tower_fea_arr.append(cross_tensor)
     # final tower
     all_fea = tf.concat(tower_fea_arr, axis=1)
-    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(all_fea)
     output = tf.layers.dense(all_fea, self._num_class, name='output')
diff --git a/easy_rec/python/model/deepfm.py b/easy_rec/python/model/deepfm.py
index d1414c050..0ead36e26 100644
--- a/easy_rec/python/model/deepfm.py
+++ b/easy_rec/python/model/deepfm.py
@@ -39,7 +39,7 @@ def __init__(self,
 
   def build_input_layer(self, model_config, feature_configs):
     # overwrite create input_layer to support wide_output_dim
-    has_final = len(model_config.deepfm.final_dnn.hidden_units) > 0
+    has_final = len(model_config.deepfm.final_mlp.hidden_units) > 0
     if not has_final:
       assert model_config.deepfm.wide_output_dim == model_config.num_class
     self._wide_output_dim = model_config.deepfm.wide_output_dim
@@ -60,9 +60,9 @@ def build_predict_graph(self):
     deep_fea = deep_layer(self._deep_features)
 
     # Final
-    if len(self._model_config.final_dnn.hidden_units) > 0:
+    if len(self._model_config.final_mlp.hidden_units) > 0:
       all_fea = tf.concat([wide_fea, fm_fea, deep_fea], axis=1)
-      final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
+      final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
                                 'final_dnn', self._is_training)
       all_fea = final_dnn_layer(all_fea)
       output = tf.layers.dense(
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index fe9a20ef8..cb6c8a802 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -13,7 +13,6 @@
 from easy_rec.python.compat import regularizers
 from easy_rec.python.layers import input_layer
 from easy_rec.python.layers.backbone import Backbone
-from easy_rec.python.layers.sequence_encoder import SequenceEncoder
 from easy_rec.python.utils import constant
 from easy_rec.python.utils import estimator_utils
 from easy_rec.python.utils import restore_filter
diff --git a/easy_rec/python/model/multi_tower.py b/easy_rec/python/model/multi_tower.py
index 5cdd89ba5..cb0aa6233 100644
--- a/easy_rec/python/model/multi_tower.py
+++ b/easy_rec/python/model/multi_tower.py
@@ -52,7 +52,7 @@ def build_predict_graph(self):
       tower_fea_arr.append(tower_fea)
 
     all_fea = tf.concat(tower_fea_arr, axis=1)
-    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(all_fea)
     output = tf.layers.dense(all_fea, self._num_class, name='output')
diff --git a/easy_rec/python/model/multi_tower_bst.py b/easy_rec/python/model/multi_tower_bst.py
index 4cbc9fd29..478d26a6c 100644
--- a/easy_rec/python/model/multi_tower_bst.py
+++ b/easy_rec/python/model/multi_tower_bst.py
@@ -180,7 +180,7 @@ def build_predict_graph(self):
       tower_fea_arr.append(tower_fea)
 
     all_fea = tf.concat(tower_fea_arr, axis=1)
-    final_dnn = dnn.DNN(self._model_config.final_dnn, self._l2_reg, 'final_dnn',
+    final_dnn = dnn.DNN(self._model_config.final_mlp, self._l2_reg, 'final_dnn',
                         self._is_training)
     all_fea = final_dnn(all_fea)
     output = tf.layers.dense(all_fea, self._num_class, name='output')
diff --git a/easy_rec/python/model/multi_tower_din.py b/easy_rec/python/model/multi_tower_din.py
index e586da1cf..7a1356caa 100644
--- a/easy_rec/python/model/multi_tower_din.py
+++ b/easy_rec/python/model/multi_tower_din.py
@@ -120,7 +120,7 @@ def build_predict_graph(self):
       tower_fea_arr.append(tower_fea)
 
     all_fea = tf.concat(tower_fea_arr, axis=1)
-    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(all_fea)
     output = tf.layers.dense(all_fea, self._num_class, name='output')
diff --git a/easy_rec/python/model/multi_tower_recall.py b/easy_rec/python/model/multi_tower_recall.py
index 8f576944e..101ad36cf 100644
--- a/easy_rec/python/model/multi_tower_recall.py
+++ b/easy_rec/python/model/multi_tower_recall.py
@@ -57,7 +57,7 @@ def build_predict_graph(self):
     tower_fea_arr.append(item_tower_emb)
 
     all_fea = tf.concat(tower_fea_arr, axis=-1)
-    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(all_fea)
     output = tf.layers.dense(all_fea, 1, name='output')
diff --git a/easy_rec/python/model/uniter.py b/easy_rec/python/model/uniter.py
index 40dfc8cb1..9479ce639 100644
--- a/easy_rec/python/model/uniter.py
+++ b/easy_rec/python/model/uniter.py
@@ -37,7 +37,7 @@ def __init__(self,
 
   def build_predict_graph(self):
     hidden = self._uniter_layer(self._is_training, l2_reg=self._l2_reg)
-    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(hidden)
 
diff --git a/easy_rec/python/model/wide_and_deep.py b/easy_rec/python/model/wide_and_deep.py
index f841ed049..e0850abe4 100755
--- a/easy_rec/python/model/wide_and_deep.py
+++ b/easy_rec/python/model/wide_and_deep.py
@@ -34,7 +34,7 @@ def __init__(self,
 
   def build_input_layer(self, model_config, feature_configs):
     # overwrite create input_layer to support wide_output_dim
-    has_final = len(model_config.wide_and_deep.final_dnn.hidden_units) > 0
+    has_final = len(model_config.wide_and_deep.final_mlp.hidden_units) > 0
     self._wide_output_dim = model_config.wide_and_deep.wide_output_dim
     if not has_final:
       model_config.wide_and_deep.wide_output_dim = model_config.num_class
@@ -55,11 +55,11 @@ def build_predict_graph(self):
     logging.info('output deep features dimension: %d' %
                  deep_fea.get_shape()[-1])
 
-    has_final = len(self._model_config.final_dnn.hidden_units) > 0
+    has_final = len(self._model_config.final_mlp.hidden_units) > 0
     print('wide_deep has_final_dnn layers = %d' % has_final)
     if has_final:
       all_fea = tf.concat([wide_fea, deep_fea], axis=1)
-      final_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
+      final_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
                             'final_dnn', self._is_training)
       all_fea = final_layer(all_fea)
       output = tf.layers.dense(
@@ -87,7 +87,7 @@ def get_grouped_vars(self):
     Return:
       list of list of variables.
     """
-    assert len(self._model_config.final_dnn.hidden_units) == 0, \
+    assert len(self._model_config.final_mlp.hidden_units) == 0, \
         'if use different optimizers for wide group and deep group, '\
         + ' final_dnn should not be set.'
     wide_vars = []

From e795f009b02883234501d498981e17585acf9456 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Mon, 19 Jun 2023 16:18:07 +0800
Subject: [PATCH 38/54] [feat]: format backbone code, add recurrent and
 sequential layer

---
 easy_rec/python/layers/common_layers.py      | 6 +++++-
 easy_rec/python/layers/keras/mask_net.py     | 9 +++++----
 easy_rec/version.py                          | 2 +-
 examples/configs/masknet_on_movielens.config | 3 +--
 examples/readme.md                           | 2 +-
 5 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index 011efb061..47f5bcb65 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -91,7 +91,11 @@ def __init__(self, config, input_layer, feature_dict):
     self._input_layer = input_layer
     self._feature_dict = feature_dict
 
-  def __call__(self, group, is_training, *args, **kwargs):
+  def __call__(self, group, is_training, **kwargs):
+    with tf.name_scope('input_' + group):
+      return self.call(group, is_training)
+
+  def call(self, group, is_training):
     if self._config.output_seq_and_normal_feature:
       seq_features, target_feature, target_features = self._input_layer(
           self._feature_dict, group, is_combine=False)
diff --git a/easy_rec/python/layers/keras/mask_net.py b/easy_rec/python/layers/keras/mask_net.py
index 2e66beb22..0ba769972 100644
--- a/easy_rec/python/layers/keras/mask_net.py
+++ b/easy_rec/python/layers/keras/mask_net.py
@@ -60,12 +60,13 @@ class MaskNet(tf.keras.layers.Layer):
   Refer: https://arxiv.org/pdf/2102.07619.pdf
   """
 
-  def __init__(self, params, name='mask_net', l2_reg=None, **kwargs):
+  def __init__(self, params, name='mask_net', **kwargs):
     super(MaskNet, self).__init__(name, **kwargs)
     self.config = params.get_pb_config()
     if self.config.HasField('mlp'):
       p = Parameter.make_from_pb(self.config.mlp)
-      self.mlp = MLP(p, name='%s/mlp' % name, l2_reg=l2_reg)
+      p.l2_regularizer = params.l2_regularizer
+      self.mlp = MLP(p, name='%s/mlp' % name)
     else:
       self.mlp = None
 
@@ -75,7 +76,7 @@ def call(self, inputs, training=None, **kwargs):
       for i, block_conf in enumerate(self.config.mask_blocks):
         params = Parameter.make_from_pb(block_conf)
         mask_layer = MaskBlock(
-            params, name='%s/block_%d' % (self.name, i), reuse=self.reuse)
+            params, name='%s/block_%d' % (self.name, i))
         mask_outputs.append(mask_layer((inputs, inputs)))
       all_mask_outputs = tf.concat(mask_outputs, axis=1)
 
@@ -89,7 +90,7 @@ def call(self, inputs, training=None, **kwargs):
       for i, block_conf in enumerate(self.config.mask_blocks):
         params = Parameter.make_from_pb(block_conf)
         mask_layer = MaskBlock(
-            params, name='%s/block_%d' % (self.name, i), reuse=self.reuse)
+            params, name='%s/block_%d' % (self.name, i))
         net = mask_layer((net, inputs))
 
       if self.mlp is not None:
diff --git a/easy_rec/version.py b/easy_rec/version.py
index f70f1bfba..520cefe3d 100644
--- a/easy_rec/version.py
+++ b/easy_rec/version.py
@@ -1,3 +1,3 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
-__version__ = '0.6.3'
+__version__ = '1.0.0'
diff --git a/examples/configs/masknet_on_movielens.config b/examples/configs/masknet_on_movielens.config
index c98e3fbd0..dccbbb13e 100644
--- a/examples/configs/masknet_on_movielens.config
+++ b/examples/configs/masknet_on_movielens.config
@@ -17,9 +17,8 @@ train_config {
     }
     use_moving_average: false
   }
-  save_checkpoints_steps: 100
+  save_checkpoints_steps: 2000
   sync_replicas: True
-  num_steps: 2500
 }
 
 eval_config {
diff --git a/examples/readme.md b/examples/readme.md
index ba4f57cce..cbf9be600 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -220,7 +220,7 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
   | DCN (Backbone)      | 1     | 0.8770 |
   | AutoInt             | 1     | 0.8513 |
   | MaskNet             | 1     | 0.8872 |
-  | FibiNet             | 1     | 0.8879 |
+  | FibiNet             | 1     | 0.8893 |
 
   备注：`MovieLens-1M` 数据集较小，评估指标方差较大，以上结果仅供参考。
 

From c4f5ea946a2aa4d4c4a8e617febcd39716232178 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 20 Jun 2023 13:20:49 +0800
Subject: [PATCH 39/54] [feat]: format backbone code, add recurrent and
 sequential layer

---
 easy_rec/python/layers/backbone.py            | 25 ++++-
 easy_rec/python/layers/common_layers.py       |  2 +-
 easy_rec/python/layers/keras/blocks.py        | 10 +-
 easy_rec/python/protos/backbone.proto         |  9 ++
 easy_rec/python/utils/config_util.py          | 93 +++++++++++++++++++
 ...pfm_backbone_on_criteo_with_autodis.config | 31 ++++---
 ...fm_backbone_on_criteo_with_periodic.config | 27 ++++--
 7 files changed, 170 insertions(+), 27 deletions(-)

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 22645bee0..cfc0e3d60 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -129,20 +129,22 @@ def __call__(self, is_training, **kwargs):
         input_fn = EnhancedInputLayer(conf, self._input_layer, self._features)
         output = input_fn(block, is_training)
         block_outputs[block] = output
-      elif layer == 'sequential':
-        print(config)
       else:
         inputs = block_input(config, block_outputs)
         output = self.call_layer(inputs, config, block, is_training)
         block_outputs[block] = output
 
-    temp = []
+    outputs = []
     for output in self._config.concat_blocks:
       if output in block_outputs:
-        temp.append(block_outputs[output])
+        temp = block_outputs[output]
+        if type(temp) in (tuple, list):
+          outputs.extend(temp)
+        else:
+          outputs.append(temp)
       else:
         raise ValueError('No output `%s` of backbone to be concat' % output)
-    output = concat_inputs(temp, msg='backbone')
+    output = concat_inputs(outputs, msg='backbone')
 
     if self._config.HasField('top_mlp'):
       params = Parameter.make_from_pb(self._config.top_mlp)
@@ -193,6 +195,19 @@ def call_layer(self, inputs, config, name, training):
       conf = getattr(config, 'lambda')
       fn = eval(conf.expression)
       return fn(inputs)
+    if layer_name == 'repeat':
+      conf = config.repeat
+      n_loop = conf.num_repeat
+      outputs = []
+      for i in range(n_loop):
+        name_i = '%s_%d' % (name, i)
+        output = self.call_keras_layer(conf.keras_layer, inputs, name_i, training)
+        outputs.append(output)
+      if len(outputs) == 1:
+        return outputs[0]
+      if conf.HasField('output_concat_axis'):
+        return tf.concat(outputs, conf.output_concat_axis)
+      return outputs
     if layer_name == 'recurrent':
       conf = config.recurrent
       fixed_input_index = -1
diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index 47f5bcb65..dd39d8259 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -109,7 +109,7 @@ def call(self, group, is_training):
     do_feature_dropout = is_training and 0.0 < self._config.feature_dropout_rate < 1.0
     if do_feature_dropout:
       keep_prob = 1.0 - self._config.feature_dropout_rate
-      bern = tf.distributions.Bernoulli(probs=keep_prob)
+      bern = tf.distributions.Bernoulli(probs=keep_prob, dtype=tf.float32)
       mask = bern.sample(num_features)
     elif do_bn:
       features = tf.layers.batch_normalization(features, training=is_training)
diff --git a/easy_rec/python/layers/keras/blocks.py b/easy_rec/python/layers/keras/blocks.py
index 2c7f08403..5c14a07c3 100644
--- a/easy_rec/python/layers/keras/blocks.py
+++ b/easy_rec/python/layers/keras/blocks.py
@@ -101,8 +101,16 @@ def add_rich_layer(self,
 
   def call(self, x, training=None, **kwargs):
     """Performs the forward computation of the block."""
+    from inspect import isfunction
     for layer in self._sub_layers:
-      x = layer(x, training=training)
+      if isfunction(layer):
+        x = layer(x, training=training)
+      else:
+        cls = layer.__class__.__name__
+        if cls in ('Dropout', 'BatchNormalization'):
+          x = layer(x, training=training)
+        else:
+          x = layer(x)
     return x
 
 
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
index 6f292a48d..5e2262707 100644
--- a/easy_rec/python/protos/backbone.proto
+++ b/easy_rec/python/protos/backbone.proto
@@ -30,11 +30,19 @@ message RecurrentLayer {
     required KerasLayer keras_layer = 3;
 }
 
+message RepeatLayer {
+    required uint32 num_repeat = 1 [default = 1];
+    // default output the list of multiple outputs
+    optional int32 output_concat_axis = 2;
+    required KerasLayer keras_layer = 3;
+}
+
 message Layer {
     oneof layer {
         Lambda lambda = 1;
         KerasLayer keras_layer = 2;
         RecurrentLayer recurrent = 3;
+        RepeatLayer repeat = 4;
     }
 }
 
@@ -54,6 +62,7 @@ message Block {
         Lambda lambda = 102;
         KerasLayer keras_layer = 103;
         RecurrentLayer recurrent = 104;
+        RepeatLayer repeat = 105;
     }
 }
 
diff --git a/easy_rec/python/utils/config_util.py b/easy_rec/python/utils/config_util.py
index b63a02f71..67f3bc351 100644
--- a/easy_rec/python/utils/config_util.py
+++ b/easy_rec/python/utils/config_util.py
@@ -5,6 +5,7 @@
 Such as Hyper parameter tuning or automatic feature expanding.
 """
 
+import argparse
 import datetime
 import json
 import logging
@@ -605,3 +606,95 @@ def process_multi_file_input_path(sampler_config_input_path):
     input_path = sampler_config_input_path
 
   return input_path
+
+
+def change_configured_embedding_dim(pipeline_config_path, groups, emb_dim):
+  """Reads config from a file containing pipeline_pb2.EasyRecConfig.
+
+  Args:
+    pipeline_config_path: Path to pipeline_pb2.EasyRecConfig text
+      proto.
+    groups: the names of feature group to be changed
+    emb_dim: target embedding dimension
+
+  Returns:
+    Dictionary of configuration objects. Keys are `model`, `train_config`,
+      `train_input_config`, `eval_config`, `eval_input_config`. Value are the
+      corresponding config objects.
+  """
+  if isinstance(pipeline_config_path, pipeline_pb2.EasyRecConfig):
+    return pipeline_config_path
+
+  assert tf.gfile.Exists(
+      pipeline_config_path
+  ), 'pipeline_config_path [%s] not exists' % pipeline_config_path
+
+  pipeline_config = pipeline_pb2.EasyRecConfig()
+  with tf.gfile.GFile(pipeline_config_path, 'r') as f:
+    config_str = f.read()
+    if pipeline_config_path.endswith('.config'):
+      text_format.Merge(config_str, pipeline_config)
+    elif pipeline_config_path.endswith('.json'):
+      json_format.Parse(config_str, pipeline_config)
+    else:
+      assert False, 'invalid file format(%s), currently support formats: .config(prototxt) .json' % pipeline_config_path
+
+  target_groups = set(groups.split(','))
+  features = set()
+  conf = pipeline_config.model_config
+  for group in conf.feature_groups:
+    if group.group_name not in target_groups:
+      continue
+    for feature in group.feature_names:
+      features.add(feature)
+
+  feature_configs = get_compatible_feature_configs(pipeline_config)
+  for fea_conf in feature_configs:
+    fea_name = fea_conf.input_names[0]
+    if fea_conf.HasField('feature_name'):
+      fea_name = fea_conf.feature_name
+    if fea_name in features:
+      fea_conf.embedding_dim = emb_dim
+
+  return pipeline_config
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+    '--pipeline_config_path',
+    type=str,
+    default=None,
+    help='Path to pipeline config file.')
+  parser.add_argument(
+    '--feature_groups',
+    type=str,
+    default=None,
+    help='The name of feature group to be changed.')
+  parser.add_argument(
+    '--embedding_dim',
+    type=int,
+    default=None,
+    help='The embedding dim to be changed to.')
+  parser.add_argument(
+    '--save_config_path',
+    type=str,
+    default=None,
+    help='Path to save changed config.')
+
+  args, extra_args = parser.parse_known_args()
+  if args.pipeline_config_path is None:
+    raise ValueError('--pipeline_config_path must be set')
+  if args.save_config_path is None:
+    raise ValueError('--save_config_path must be set')
+  if args.feature_groups is None:
+    raise ValueError('--feature_groups must be set')
+  if args.embedding_dim is None:
+    raise ValueError('--embedding_dim must be set')
+
+  # 传入一个不存在的feature group，可以起到format配置文件的效果
+  config = change_configured_embedding_dim(
+    args.pipeline_config_path,
+    args.feature_groups,
+    args.embedding_dim)
+  save_message(config, args.save_config_path)
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
index 970508598..49fcf8e38 100644
--- a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
+++ b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
@@ -674,7 +674,7 @@ model_config: {
       inputs {
         name: 'wide_features'
       }
-      Lambda {
+      lambda {
         expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
       }
     }
@@ -683,11 +683,14 @@ model_config: {
       inputs {
         name: 'numerical_features'
       }
-      auto_dis_embedding {
-        embedding_dim: 16
-        num_bins: 20
-        temperature: 0.815
-        output_tensor_list: true
+      keras_layer {
+        class_name: 'AutoDisEmbedding'
+        auto_dis_embedding {
+          embedding_dim: 16
+          num_bins: 20
+          temperature: 0.815
+          output_tensor_list: true
+        }
       }
     }
     blocks {
@@ -706,8 +709,11 @@ model_config: {
         name: 'num_emb'
         input_fn: 'lambda x: x[1]'
       }
-      fm {
-        use_variant: true
+      keras_layer {
+        class_name: 'FM'
+        fm {
+          use_variant: true
+        }
       }
     }
     blocks {
@@ -720,11 +726,14 @@ model_config: {
         name: 'num_emb'
         input_fn: 'lambda x: x[0]'
       }
-      mlp {
-        hidden_units: [256, 128, 64]
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64]
+        }
       }
     }
-    // no wide_logit may have better performance
+    # no wide_logit may have better performance
     concat_blocks: ['wide_logit', 'fm', 'deep']
     top_mlp {
       hidden_units: [256, 128, 64]
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_periodic.config b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
index 82dd01998..2f2f8435b 100644
--- a/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
+++ b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
@@ -674,7 +674,7 @@ model_config: {
       inputs {
         name: 'wide_features'
       }
-      Lambda {
+      lambda {
         expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
       }
     }
@@ -683,10 +683,13 @@ model_config: {
       inputs {
         name: 'numerical_features'
       }
-      periodic_embedding {
-        embedding_dim: 16
-        sigma: 0.005
-        output_tensor_list: true
+      keras_layer {
+        class_name: 'PeriodicEmbedding'
+        periodic_embedding {
+          embedding_dim: 16
+          sigma: 0.005
+          output_tensor_list: true
+        }
       }
     }
     blocks {
@@ -705,8 +708,11 @@ model_config: {
         name: 'num_emb'
         input_fn: 'lambda x: x[1]'
       }
-      fm {
-        use_variant: true
+      keras_layer {
+        class_name: 'FM'
+        fm {
+          use_variant: true
+        }
       }
     }
     blocks {
@@ -719,8 +725,11 @@ model_config: {
         name: 'num_emb'
         input_fn: 'lambda x: x[0]'
       }
-      mlp {
-        hidden_units: [256, 128, 64]
+      keras_layer {
+        class_name: 'MLP'
+        mlp {
+          hidden_units: [256, 128, 64]
+        }
       }
     }
     concat_blocks: ['wide_logit', 'fm', 'deep']

From 1b504a8df374dcb493ca5e9bdf9f1d6df057fbd0 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 20 Jun 2023 15:11:38 +0800
Subject: [PATCH 40/54] [feat]: add repeat block

---
 easy_rec/python/model/cmbf.py               | 2 +-
 easy_rec/python/model/dcn.py                | 2 +-
 easy_rec/python/model/deepfm.py             | 6 +++---
 easy_rec/python/model/multi_tower.py        | 2 +-
 easy_rec/python/model/multi_tower_bst.py    | 2 +-
 easy_rec/python/model/multi_tower_din.py    | 2 +-
 easy_rec/python/model/multi_tower_recall.py | 2 +-
 easy_rec/python/model/uniter.py             | 2 +-
 easy_rec/python/model/wide_and_deep.py      | 8 ++++----
 9 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/easy_rec/python/model/cmbf.py b/easy_rec/python/model/cmbf.py
index a11a30582..0f0a8f3aa 100644
--- a/easy_rec/python/model/cmbf.py
+++ b/easy_rec/python/model/cmbf.py
@@ -38,7 +38,7 @@ def __init__(self,
 
   def build_predict_graph(self):
     hidden = self._cmbf_layer(self._is_training, l2_reg=self._l2_reg)
-    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(hidden)
 
diff --git a/easy_rec/python/model/dcn.py b/easy_rec/python/model/dcn.py
index 2a460163a..fcfa7e780 100644
--- a/easy_rec/python/model/dcn.py
+++ b/easy_rec/python/model/dcn.py
@@ -60,7 +60,7 @@ def build_predict_graph(self):
     tower_fea_arr.append(cross_tensor)
     # final tower
     all_fea = tf.concat(tower_fea_arr, axis=1)
-    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(all_fea)
     output = tf.layers.dense(all_fea, self._num_class, name='output')
diff --git a/easy_rec/python/model/deepfm.py b/easy_rec/python/model/deepfm.py
index 0ead36e26..d1414c050 100644
--- a/easy_rec/python/model/deepfm.py
+++ b/easy_rec/python/model/deepfm.py
@@ -39,7 +39,7 @@ def __init__(self,
 
   def build_input_layer(self, model_config, feature_configs):
     # overwrite create input_layer to support wide_output_dim
-    has_final = len(model_config.deepfm.final_mlp.hidden_units) > 0
+    has_final = len(model_config.deepfm.final_dnn.hidden_units) > 0
     if not has_final:
       assert model_config.deepfm.wide_output_dim == model_config.num_class
     self._wide_output_dim = model_config.deepfm.wide_output_dim
@@ -60,9 +60,9 @@ def build_predict_graph(self):
     deep_fea = deep_layer(self._deep_features)
 
     # Final
-    if len(self._model_config.final_mlp.hidden_units) > 0:
+    if len(self._model_config.final_dnn.hidden_units) > 0:
       all_fea = tf.concat([wide_fea, fm_fea, deep_fea], axis=1)
-      final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
+      final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
                                 'final_dnn', self._is_training)
       all_fea = final_dnn_layer(all_fea)
       output = tf.layers.dense(
diff --git a/easy_rec/python/model/multi_tower.py b/easy_rec/python/model/multi_tower.py
index cb0aa6233..5cdd89ba5 100644
--- a/easy_rec/python/model/multi_tower.py
+++ b/easy_rec/python/model/multi_tower.py
@@ -52,7 +52,7 @@ def build_predict_graph(self):
       tower_fea_arr.append(tower_fea)
 
     all_fea = tf.concat(tower_fea_arr, axis=1)
-    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(all_fea)
     output = tf.layers.dense(all_fea, self._num_class, name='output')
diff --git a/easy_rec/python/model/multi_tower_bst.py b/easy_rec/python/model/multi_tower_bst.py
index 478d26a6c..4cbc9fd29 100644
--- a/easy_rec/python/model/multi_tower_bst.py
+++ b/easy_rec/python/model/multi_tower_bst.py
@@ -180,7 +180,7 @@ def build_predict_graph(self):
       tower_fea_arr.append(tower_fea)
 
     all_fea = tf.concat(tower_fea_arr, axis=1)
-    final_dnn = dnn.DNN(self._model_config.final_mlp, self._l2_reg, 'final_dnn',
+    final_dnn = dnn.DNN(self._model_config.final_dnn, self._l2_reg, 'final_dnn',
                         self._is_training)
     all_fea = final_dnn(all_fea)
     output = tf.layers.dense(all_fea, self._num_class, name='output')
diff --git a/easy_rec/python/model/multi_tower_din.py b/easy_rec/python/model/multi_tower_din.py
index 7a1356caa..e586da1cf 100644
--- a/easy_rec/python/model/multi_tower_din.py
+++ b/easy_rec/python/model/multi_tower_din.py
@@ -120,7 +120,7 @@ def build_predict_graph(self):
       tower_fea_arr.append(tower_fea)
 
     all_fea = tf.concat(tower_fea_arr, axis=1)
-    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(all_fea)
     output = tf.layers.dense(all_fea, self._num_class, name='output')
diff --git a/easy_rec/python/model/multi_tower_recall.py b/easy_rec/python/model/multi_tower_recall.py
index 101ad36cf..8f576944e 100644
--- a/easy_rec/python/model/multi_tower_recall.py
+++ b/easy_rec/python/model/multi_tower_recall.py
@@ -57,7 +57,7 @@ def build_predict_graph(self):
     tower_fea_arr.append(item_tower_emb)
 
     all_fea = tf.concat(tower_fea_arr, axis=-1)
-    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(all_fea)
     output = tf.layers.dense(all_fea, 1, name='output')
diff --git a/easy_rec/python/model/uniter.py b/easy_rec/python/model/uniter.py
index 9479ce639..40dfc8cb1 100644
--- a/easy_rec/python/model/uniter.py
+++ b/easy_rec/python/model/uniter.py
@@ -37,7 +37,7 @@ def __init__(self,
 
   def build_predict_graph(self):
     hidden = self._uniter_layer(self._is_training, l2_reg=self._l2_reg)
-    final_dnn_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
+    final_dnn_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
                               'final_dnn', self._is_training)
     all_fea = final_dnn_layer(hidden)
 
diff --git a/easy_rec/python/model/wide_and_deep.py b/easy_rec/python/model/wide_and_deep.py
index e0850abe4..f841ed049 100755
--- a/easy_rec/python/model/wide_and_deep.py
+++ b/easy_rec/python/model/wide_and_deep.py
@@ -34,7 +34,7 @@ def __init__(self,
 
   def build_input_layer(self, model_config, feature_configs):
     # overwrite create input_layer to support wide_output_dim
-    has_final = len(model_config.wide_and_deep.final_mlp.hidden_units) > 0
+    has_final = len(model_config.wide_and_deep.final_dnn.hidden_units) > 0
     self._wide_output_dim = model_config.wide_and_deep.wide_output_dim
     if not has_final:
       model_config.wide_and_deep.wide_output_dim = model_config.num_class
@@ -55,11 +55,11 @@ def build_predict_graph(self):
     logging.info('output deep features dimension: %d' %
                  deep_fea.get_shape()[-1])
 
-    has_final = len(self._model_config.final_mlp.hidden_units) > 0
+    has_final = len(self._model_config.final_dnn.hidden_units) > 0
     print('wide_deep has_final_dnn layers = %d' % has_final)
     if has_final:
       all_fea = tf.concat([wide_fea, deep_fea], axis=1)
-      final_layer = dnn.DNN(self._model_config.final_mlp, self._l2_reg,
+      final_layer = dnn.DNN(self._model_config.final_dnn, self._l2_reg,
                             'final_dnn', self._is_training)
       all_fea = final_layer(all_fea)
       output = tf.layers.dense(
@@ -87,7 +87,7 @@ def get_grouped_vars(self):
     Return:
       list of list of variables.
     """
-    assert len(self._model_config.final_mlp.hidden_units) == 0, \
+    assert len(self._model_config.final_dnn.hidden_units) == 0, \
         'if use different optimizers for wide group and deep group, '\
         + ' final_dnn should not be set.'
     wide_vars = []

From 32ff01cad4cf14ea0e936d1f6e7e481201712474 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 20 Jun 2023 16:59:25 +0800
Subject: [PATCH 41/54] fix bug of no is_predicting argument

---
 easy_rec/python/model/easy_rec_model.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index cb6c8a802..8920f06dc 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -33,12 +33,11 @@ def __init__(self,
                feature_configs,
                features,
                labels=None,
-               is_training=False,
-               is_predicting=False):
+               is_training=False):
     self._base_model_config = model_config
     self._model_config = model_config
     self._is_training = is_training
-    self._is_predicting = is_predicting
+    self._is_predicting = labels is None
     self._feature_dict = features
 
     # embedding variable parameters

From 0c087d94b7f31e9eb8d46c402ef647de16c49007 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 20 Jun 2023 17:30:38 +0800
Subject: [PATCH 42/54] fix bug of no is_predicting argument

---
 .../compat/feature_column/feature_column.py   |  6 +--
 easy_rec/python/layers/backbone.py            |  3 +-
 easy_rec/python/layers/keras/blocks.py        | 14 ++++---
 easy_rec/python/layers/keras/mask_net.py      |  6 +--
 easy_rec/python/model/easy_rec_estimator.py   |  3 +-
 easy_rec/python/utils/config_util.py          | 39 +++++++++----------
 6 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/easy_rec/python/compat/feature_column/feature_column.py b/easy_rec/python/compat/feature_column/feature_column.py
index 27557e9a7..d446adb76 100644
--- a/easy_rec/python/compat/feature_column/feature_column.py
+++ b/easy_rec/python/compat/feature_column/feature_column.py
@@ -2541,9 +2541,9 @@ def raw_name(self):
   @property
   def cardinality(self):
     from easy_rec.python.compat.feature_column.feature_column_v2 import HashedCategoricalColumn, \
-      BucketizedColumn, WeightedCategoricalColumn, SequenceWeightedCategoricalColumn, \
-      CrossedColumn, IdentityCategoricalColumn, VocabularyListCategoricalColumn, \
-      VocabularyFileCategoricalColumn
+        BucketizedColumn, WeightedCategoricalColumn, SequenceWeightedCategoricalColumn, \
+        CrossedColumn, IdentityCategoricalColumn, VocabularyListCategoricalColumn, \
+        VocabularyFileCategoricalColumn
 
     fc = self.categorical_column
     if isinstance(fc, HashedCategoricalColumn) or isinstance(fc, CrossedColumn):
diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index cfc0e3d60..b673a209a 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -201,7 +201,8 @@ def call_layer(self, inputs, config, name, training):
       outputs = []
       for i in range(n_loop):
         name_i = '%s_%d' % (name, i)
-        output = self.call_keras_layer(conf.keras_layer, inputs, name_i, training)
+        output = self.call_keras_layer(conf.keras_layer, inputs, name_i,
+                                       training)
         outputs.append(output)
       if len(outputs) == 1:
         return outputs[0]
diff --git a/easy_rec/python/layers/keras/blocks.py b/easy_rec/python/layers/keras/blocks.py
index 5c14a07c3..62063e451 100644
--- a/easy_rec/python/layers/keras/blocks.py
+++ b/easy_rec/python/layers/keras/blocks.py
@@ -63,6 +63,11 @@ def add_rich_layer(self,
                      use_bn_after_activation=False,
                      name='mlp',
                      l2_reg=None):
+
+    def batch_norm(x, training):
+      return tf.layers.batch_normalization(
+          x, training=training, name='%s/%s/bn' % (self.name, name))
+
     act_fn = get_activation(activation)
     if use_bn and not use_bn_after_activation:
       dense = tf.keras.layers.Dense(
@@ -72,11 +77,10 @@ def add_rich_layer(self,
           kernel_regularizer=l2_reg,
           name=name)
       self._sub_layers.append(dense)
+
       # bn = tf.keras.layers.BatchNormalization(name='%s/bn' % name)
       # keras BN layer have a stale issue on some versions of tf
-      bn = lambda x, training: tf.layers.batch_normalization(
-          x, training=training, name='%s/%s/bn' % (self.name, name))
-      self._sub_layers.append(bn)
+      self._sub_layers.append(batch_norm)
       act = tf.keras.layers.Activation(act_fn, name='%s/act' % name)
       self._sub_layers.append(act)
     else:
@@ -89,9 +93,7 @@ def add_rich_layer(self,
           name=name)
       self._sub_layers.append(dense)
       if use_bn and use_bn_after_activation:
-        bn = lambda x, training: tf.layers.batch_normalization(
-            x, training=training, name='%s/%s/bn' % (self.name, name))
-        self._sub_layers.append(bn)
+        self._sub_layers.append(batch_norm)
 
     if 0.0 < dropout_rate < 1.0:
       dropout = tf.keras.layers.Dropout(dropout_rate, name='%s/dropout' % name)
diff --git a/easy_rec/python/layers/keras/mask_net.py b/easy_rec/python/layers/keras/mask_net.py
index 0ba769972..ca939bb7e 100644
--- a/easy_rec/python/layers/keras/mask_net.py
+++ b/easy_rec/python/layers/keras/mask_net.py
@@ -75,8 +75,7 @@ def call(self, inputs, training=None, **kwargs):
       mask_outputs = []
       for i, block_conf in enumerate(self.config.mask_blocks):
         params = Parameter.make_from_pb(block_conf)
-        mask_layer = MaskBlock(
-            params, name='%s/block_%d' % (self.name, i))
+        mask_layer = MaskBlock(params, name='%s/block_%d' % (self.name, i))
         mask_outputs.append(mask_layer((inputs, inputs)))
       all_mask_outputs = tf.concat(mask_outputs, axis=1)
 
@@ -89,8 +88,7 @@ def call(self, inputs, training=None, **kwargs):
       net = inputs
       for i, block_conf in enumerate(self.config.mask_blocks):
         params = Parameter.make_from_pb(block_conf)
-        mask_layer = MaskBlock(
-            params, name='%s/block_%d' % (self.name, i))
+        mask_layer = MaskBlock(params, name='%s/block_%d' % (self.name, i))
         net = mask_layer((net, inputs))
 
       if self.mlp is not None:
diff --git a/easy_rec/python/model/easy_rec_estimator.py b/easy_rec/python/model/easy_rec_estimator.py
index 9cbd28b6c..51ecad09f 100644
--- a/easy_rec/python/model/easy_rec_estimator.py
+++ b/easy_rec/python/model/easy_rec_estimator.py
@@ -514,8 +514,7 @@ def _export_model_fn(self, features, labels, run_config, params):
         self.feature_configs,
         features,
         labels=None,
-        is_training=False,
-        is_predicting=True)
+        is_training=False)
     model.build_predict_graph()
 
     export_config = self._pipeline_config.export_config
diff --git a/easy_rec/python/utils/config_util.py b/easy_rec/python/utils/config_util.py
index 67f3bc351..72c050775 100644
--- a/easy_rec/python/utils/config_util.py
+++ b/easy_rec/python/utils/config_util.py
@@ -662,25 +662,25 @@ def change_configured_embedding_dim(pipeline_config_path, groups, emb_dim):
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
-    '--pipeline_config_path',
-    type=str,
-    default=None,
-    help='Path to pipeline config file.')
+      '--pipeline_config_path',
+      type=str,
+      default=None,
+      help='Path to pipeline config file.')
   parser.add_argument(
-    '--feature_groups',
-    type=str,
-    default=None,
-    help='The name of feature group to be changed.')
+      '--feature_groups',
+      type=str,
+      default=None,
+      help='The name of feature group to be changed.')
   parser.add_argument(
-    '--embedding_dim',
-    type=int,
-    default=None,
-    help='The embedding dim to be changed to.')
+      '--embedding_dim',
+      type=int,
+      default=None,
+      help='The embedding dim to be changed to.')
   parser.add_argument(
-    '--save_config_path',
-    type=str,
-    default=None,
-    help='Path to save changed config.')
+      '--save_config_path',
+      type=str,
+      default=None,
+      help='Path to save changed config.')
 
   args, extra_args = parser.parse_known_args()
   if args.pipeline_config_path is None:
@@ -693,8 +693,7 @@ def change_configured_embedding_dim(pipeline_config_path, groups, emb_dim):
     raise ValueError('--embedding_dim must be set')
 
   # 传入一个不存在的feature group，可以起到format配置文件的效果
-  config = change_configured_embedding_dim(
-    args.pipeline_config_path,
-    args.feature_groups,
-    args.embedding_dim)
+  config = change_configured_embedding_dim(args.pipeline_config_path,
+                                           args.feature_groups,
+                                           args.embedding_dim)
   save_message(config, args.save_config_path)

From af871b36f236a7eef48e91d17029b823f9e624bf Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Tue, 20 Jun 2023 19:43:23 +0800
Subject: [PATCH 43/54] fix deepfm distribute eval test case

---
 easy_rec/python/model/mind.py                 |   2 +-
 easy_rec/python/test/train_eval_test.py       |  20 +-
 easy_rec/python/utils/io_util.py              |   2 +-
 ...equence_feature_aux_hist_seq_taobao.config | 292 ------------------
 .../deepfm_on_sequence_feature_taobao.config  | 291 -----------------
 .../fm_on_sequence_feature_taobao.config      | 288 -----------------
 6 files changed, 6 insertions(+), 889 deletions(-)
 delete mode 100644 samples/model_config/deepfm_on_sequence_feature_aux_hist_seq_taobao.config
 delete mode 100644 samples/model_config/deepfm_on_sequence_feature_taobao.config
 delete mode 100644 samples/model_config/fm_on_sequence_feature_taobao.config

diff --git a/easy_rec/python/model/mind.py b/easy_rec/python/model/mind.py
index c414703d2..270060297 100644
--- a/easy_rec/python/model/mind.py
+++ b/easy_rec/python/model/mind.py
@@ -32,7 +32,7 @@ def __init__(self,
         'invalid model config: %s' % self._model_config.WhichOneof('model')
     self._model_config = self._model_config.mind
 
-    self._hist_seq_features = self._input_layer(
+    self._hist_seq_features, _, _ = self._input_layer(
         self._feature_dict, 'hist', is_combine=False)
     self._user_features, _ = self._input_layer(self._feature_dict, 'user')
     self._item_features, _ = self._input_layer(self._feature_dict, 'item')
diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py
index cbdf95dd2..8f0f25aa1 100644
--- a/easy_rec/python/test/train_eval_test.py
+++ b/easy_rec/python/test/train_eval_test.py
@@ -306,10 +306,10 @@ def test_bst(self):
         'samples/model_config/bst_on_taobao.config', self._test_dir)
     self.assertTrue(self._success)
 
-  def test_bst_contrastive_learning(self):
-    self._success = test_utils.test_single_train_eval(
-        'samples/model_config/bst_cl_on_taobao.config', self._test_dir)
-    self.assertTrue(self._success)
+  # def test_bst_contrastive_learning(self):
+  #   self._success = test_utils.test_single_train_eval(
+  #       'samples/model_config/bst_cl_on_taobao.config', self._test_dir)
+  #   self.assertTrue(self._success)
 
   def test_dcn(self):
     self._success = test_utils.test_single_train_eval(
@@ -800,12 +800,6 @@ def test_sequence_esmm(self):
         self._test_dir)
     self.assertTrue(self._success)
 
-  def test_sequence_fm(self):
-    self._success = test_utils.test_single_train_eval(
-        'samples/model_config/fm_on_sequence_feature_taobao.config',
-        self._test_dir)
-    self.assertTrue(self._success)
-
   def test_sequence_mmoe(self):
     self._success = test_utils.test_single_train_eval(
         'samples/model_config/mmoe_on_sequence_feature_taobao.config',
@@ -1036,12 +1030,6 @@ def test_dbmtl_on_multi_numeric_boundary_aux_hist_seq(self):
         self._test_dir)
     self.assertTrue(self._success)
 
-  def test_deepfm_on_sequence_feature_aux_hist_seq(self):
-    self._success = test_utils.test_single_train_eval(
-        'samples/model_config/deepfm_on_sequence_feature_aux_hist_seq_taobao.config',
-        self._test_dir)
-    self.assertTrue(self._success)
-
   @unittest.skipIf(gl is None, 'graphlearn is not installed')
   def test_multi_tower_recall_neg_sampler_sequence_feature(self):
     self._success = test_utils.test_single_train_eval(
diff --git a/easy_rec/python/utils/io_util.py b/easy_rec/python/utils/io_util.py
index 4c1c28550..091e10e07 100644
--- a/easy_rec/python/utils/io_util.py
+++ b/easy_rec/python/utils/io_util.py
@@ -97,7 +97,7 @@ def download(oss_or_url, dst_dir=''):
 def create_module_dir(dst_dir):
   if not os.path.exists(dst_dir):
     os.makedirs(dst_dir)
-    with open(os.path.join(dst_dir, 'explainer.py'), 'w') as ofile:
+    with open(os.path.join(dst_dir, '__init__.py'), 'w') as ofile:
       ofile.write('\n')
 
 
diff --git a/samples/model_config/deepfm_on_sequence_feature_aux_hist_seq_taobao.config b/samples/model_config/deepfm_on_sequence_feature_aux_hist_seq_taobao.config
deleted file mode 100644
index a663d2f03..000000000
--- a/samples/model_config/deepfm_on_sequence_feature_aux_hist_seq_taobao.config
+++ /dev/null
@@ -1,292 +0,0 @@
-train_input_path: "data/test/tb_data/taobao_train_data"
-eval_input_path: "data/test/tb_data/taobao_test_data"
-model_dir: "experiments/deepfm_on_taobao_ckpt"
-
-train_config {
-  log_step_count_steps: 100
-  optimizer_config: {
-    adam_optimizer: {
-      learning_rate: {
-        exponential_decay_learning_rate {
-          initial_learning_rate: 0.001
-          decay_steps: 1000
-          decay_factor: 0.5
-          min_learning_rate: 0.00001
-        }
-      }
-    }
-    use_moving_average: false
-  }
-  save_checkpoints_steps: 100
-  sync_replicas: True
-  num_steps: 1000
-}
-
-eval_config {
-  metrics_set: {
-    auc {}
-  }
-}
-
-data_config {
-  input_fields {
-    input_name:'clk'
-    input_type: INT32
-  }
-  input_fields {
-    input_name:'buy'
-    input_type: INT32
-  }
-  input_fields {
-    input_name: 'pid'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'adgroup_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'cate_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'campaign_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'customer'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'brand'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'user_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'cms_segid'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'cms_group_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'final_gender_code'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'age_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'pvalue_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'shopping_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'occupation'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'new_user_class_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'tag_category_list'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'tag_brand_list'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'price'
-    input_type: INT32
-  }
-
-  label_fields: 'clk'
-  batch_size: 4096
-  num_epochs: 10000
-  prefetch_size: 32
-  input_type: CSVInput
-}
-
-feature_configs : {
-  input_names: 'pid'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'adgroup_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'cate_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10000
-}
-feature_configs : {
-  input_names: 'campaign_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'customer'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'brand'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'user_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'cms_segid'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100
-}
-feature_configs : {
-  input_names: 'cms_group_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100
-}
-feature_configs : {
-  input_names: 'final_gender_code'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'age_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'pvalue_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'shopping_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'occupation'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'new_user_class_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-   input_names: 'tag_category_list'
-   feature_type: SequenceFeature
-   separator: '|'
-   hash_bucket_size: 10000
-   embedding_dim: 16
-}
-feature_configs : {
-   input_names: 'tag_brand_list'
-   feature_type: SequenceFeature
-   separator: '|'
-   hash_bucket_size: 100000
-   embedding_dim: 16
-}
-feature_configs : {
-  input_names: 'price'
-  feature_type: IdFeature
-  embedding_dim: 16
-  num_buckets: 50
-}
-
-model_config: {
-  model_class: 'DeepFM'
-  feature_groups: {
-    group_name: 'wide'
-    feature_names: 'user_id'
-    feature_names: 'cms_segid'
-    feature_names: 'cms_group_id'
-    feature_names: 'age_level'
-    feature_names: 'pvalue_level'
-    feature_names: 'shopping_level'
-    feature_names: 'occupation'
-    feature_names: 'new_user_class_level'
-    feature_names: 'adgroup_id'
-    feature_names: 'cate_id'
-    feature_names: 'campaign_id'
-    feature_names: 'customer'
-    feature_names: 'brand'
-    feature_names: 'price'
-    feature_names: 'pid'
-    wide_deep: WIDE
-  }
-  feature_groups: {
-    group_name: 'deep'
-    feature_names: 'user_id'
-    feature_names: 'cms_segid'
-    feature_names: 'cms_group_id'
-    feature_names: 'age_level'
-    feature_names: 'pvalue_level'
-    feature_names: 'shopping_level'
-    feature_names: 'occupation'
-    feature_names: 'new_user_class_level'
-    feature_names: 'adgroup_id'
-    feature_names: 'cate_id'
-    feature_names: 'campaign_id'
-    feature_names: 'customer'
-    feature_names: 'brand'
-    feature_names: 'price'
-    feature_names: 'pid'
-    wide_deep: DEEP
-    sequence_features: {
-    group_name: "seq_fea"
-    tf_summary: false
-    allow_key_transform:true
-    seq_att_map: {
-       key: "brand"
-       key: "cate_id"
-       hist_seq: "tag_brand_list"
-       aux_hist_seq: "tag_category_list"
-    }
-  }
-  }
-  deepfm {
-    dnn {
-      hidden_units: [256, 256, 256]
-    }
-    l2_regularization: 1e-4
-  }
-  embedding_regularization: 1e-5
-}
-
-export_config {
-}
diff --git a/samples/model_config/deepfm_on_sequence_feature_taobao.config b/samples/model_config/deepfm_on_sequence_feature_taobao.config
deleted file mode 100644
index 059e33d7b..000000000
--- a/samples/model_config/deepfm_on_sequence_feature_taobao.config
+++ /dev/null
@@ -1,291 +0,0 @@
-train_input_path: "data/test/tb_data/taobao_train_data"
-eval_input_path: "data/test/tb_data/taobao_test_data"
-model_dir: "experiments/deepfm_on_taobao_ckpt"
-
-train_config {
-  log_step_count_steps: 100
-  optimizer_config: {
-    adam_optimizer: {
-      learning_rate: {
-        exponential_decay_learning_rate {
-          initial_learning_rate: 0.001
-          decay_steps: 1000
-          decay_factor: 0.5
-          min_learning_rate: 0.00001
-        }
-      }
-    }
-    use_moving_average: false
-  }
-  save_checkpoints_steps: 100
-  sync_replicas: True
-  num_steps: 2500
-}
-
-eval_config {
-  metrics_set: {
-    auc {}
-  }
-}
-
-data_config {
-  input_fields {
-    input_name:'clk'
-    input_type: INT32
-  }
-  input_fields {
-    input_name:'buy'
-    input_type: INT32
-  }
-  input_fields {
-    input_name: 'pid'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'adgroup_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'cate_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'campaign_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'customer'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'brand'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'user_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'cms_segid'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'cms_group_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'final_gender_code'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'age_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'pvalue_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'shopping_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'occupation'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'new_user_class_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'tag_category_list'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'tag_brand_list'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'price'
-    input_type: INT32
-  }
-
-  label_fields: 'clk'
-  batch_size: 4096
-  num_epochs: 10000
-  prefetch_size: 32
-  input_type: CSVInput
-}
-
-feature_configs : {
-  input_names: 'pid'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'adgroup_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'cate_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10000
-}
-feature_configs : {
-  input_names: 'campaign_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'customer'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'brand'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'user_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'cms_segid'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100
-}
-feature_configs : {
-  input_names: 'cms_group_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100
-}
-feature_configs : {
-  input_names: 'final_gender_code'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'age_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'pvalue_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'shopping_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'occupation'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'new_user_class_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-   input_names: 'tag_category_list'
-   feature_type: SequenceFeature
-   separator: '|'
-   hash_bucket_size: 10000
-   embedding_dim: 16
-}
-feature_configs : {
-   input_names: 'tag_brand_list'
-   feature_type: SequenceFeature
-   separator: '|'
-   hash_bucket_size: 100000
-   embedding_dim: 16
-}
-feature_configs : {
-  input_names: 'price'
-  feature_type: IdFeature
-  embedding_dim: 16
-  num_buckets: 50
-}
-
-model_config: {
-  model_class: 'DeepFM'
-  feature_groups: {
-    group_name: 'wide'
-    feature_names: 'user_id'
-    feature_names: 'cms_segid'
-    feature_names: 'cms_group_id'
-    feature_names: 'age_level'
-    feature_names: 'pvalue_level'
-    feature_names: 'shopping_level'
-    feature_names: 'occupation'
-    feature_names: 'new_user_class_level'
-    feature_names: 'adgroup_id'
-    feature_names: 'cate_id'
-    feature_names: 'campaign_id'
-    feature_names: 'customer'
-    feature_names: 'brand'
-    feature_names: 'price'
-    feature_names: 'pid'
-    wide_deep: WIDE
-  }
-  feature_groups: {
-    group_name: 'deep'
-    feature_names: 'user_id'
-    feature_names: 'cms_segid'
-    feature_names: 'cms_group_id'
-    feature_names: 'age_level'
-    feature_names: 'pvalue_level'
-    feature_names: 'shopping_level'
-    feature_names: 'occupation'
-    feature_names: 'new_user_class_level'
-    feature_names: 'adgroup_id'
-    feature_names: 'cate_id'
-    feature_names: 'campaign_id'
-    feature_names: 'customer'
-    feature_names: 'brand'
-    feature_names: 'price'
-    feature_names: 'pid'
-    wide_deep: DEEP
-    sequence_features: {
-    group_name: "seq_fea"
-    tf_summary: false
-    seq_att_map: {
-       key: "brand"
-       key: "cate_id"
-       hist_seq: "tag_brand_list"
-       hist_seq: "tag_category_list"
-    }
-  }
-  }
-  deepfm {
-    dnn {
-      hidden_units: [256, 256, 256]
-    }
-    l2_regularization: 1e-4
-  }
-  embedding_regularization: 1e-5
-}
-
-export_config {
-}
diff --git a/samples/model_config/fm_on_sequence_feature_taobao.config b/samples/model_config/fm_on_sequence_feature_taobao.config
deleted file mode 100644
index eb6096acb..000000000
--- a/samples/model_config/fm_on_sequence_feature_taobao.config
+++ /dev/null
@@ -1,288 +0,0 @@
-train_input_path: "data/test/tb_data/taobao_train_data"
-eval_input_path: "data/test/tb_data/taobao_test_data"
-model_dir: "experiments/fm_taobao_ckpt"
-
-train_config {
-  log_step_count_steps: 100
-  optimizer_config: {
-    adam_optimizer: {
-      learning_rate: {
-        exponential_decay_learning_rate {
-          initial_learning_rate: 0.001
-          decay_steps: 1000
-          decay_factor: 0.5
-          min_learning_rate: 0.00001
-        }
-      }
-    }
-    use_moving_average: false
-  }
-  save_checkpoints_steps: 100
-  sync_replicas: True
-  num_steps: 2500
-}
-
-eval_config {
-  metrics_set: {
-    auc {}
-  }
-}
-
-data_config {
-  input_fields {
-    input_name:'clk'
-    input_type: INT32
-  }
-  input_fields {
-    input_name:'buy'
-    input_type: INT32
-  }
-  input_fields {
-    input_name: 'pid'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'adgroup_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'cate_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'campaign_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'customer'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'brand'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'user_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'cms_segid'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'cms_group_id'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'final_gender_code'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'age_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'pvalue_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'shopping_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'occupation'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'new_user_class_level'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'tag_category_list'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'tag_brand_list'
-    input_type: STRING
-  }
-  input_fields {
-    input_name: 'price'
-    input_type: INT32
-  }
-
-  label_fields: 'clk'
-  batch_size: 4096
-  num_epochs: 10000
-  prefetch_size: 32
-  input_type: CSVInput
-}
-
-feature_configs : {
-  input_names: 'pid'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'adgroup_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'cate_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10000
-}
-feature_configs : {
-  input_names: 'campaign_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'customer'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'brand'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'user_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100000
-}
-feature_configs : {
-  input_names: 'cms_segid'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100
-}
-feature_configs : {
-  input_names: 'cms_group_id'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 100
-}
-feature_configs : {
-  input_names: 'final_gender_code'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'age_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'pvalue_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'shopping_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'occupation'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-  input_names: 'new_user_class_level'
-  feature_type: IdFeature
-  embedding_dim: 16
-  hash_bucket_size: 10
-}
-feature_configs : {
-   input_names: 'tag_category_list'
-   feature_type: SequenceFeature
-   separator: '|'
-   hash_bucket_size: 10000
-   embedding_dim: 16
-}
-feature_configs : {
-   input_names: 'tag_brand_list'
-   feature_type: SequenceFeature
-   separator: '|'
-   hash_bucket_size: 100000
-   embedding_dim: 16
-}
-feature_configs : {
-  input_names: 'price'
-  feature_type: IdFeature
-  embedding_dim: 16
-  num_buckets: 50
-}
-
-model_config: {
-  model_class: 'FM'
-  feature_groups: {
-    group_name: 'wide'
-    feature_names: 'user_id'
-    feature_names: 'cms_segid'
-    feature_names: 'cms_group_id'
-    feature_names: 'age_level'
-    feature_names: 'pvalue_level'
-    feature_names: 'shopping_level'
-    feature_names: 'occupation'
-    feature_names: 'new_user_class_level'
-    feature_names: 'adgroup_id'
-    feature_names: 'cate_id'
-    feature_names: 'campaign_id'
-    feature_names: 'customer'
-    feature_names: 'brand'
-    feature_names: 'price'
-    feature_names: 'pid'
-    wide_deep: WIDE
-  }
-  feature_groups: {
-    group_name: 'deep'
-    feature_names: 'user_id'
-    feature_names: 'cms_segid'
-    feature_names: 'cms_group_id'
-    feature_names: 'age_level'
-    feature_names: 'pvalue_level'
-    feature_names: 'shopping_level'
-    feature_names: 'occupation'
-    feature_names: 'new_user_class_level'
-    feature_names: 'adgroup_id'
-    feature_names: 'cate_id'
-    feature_names: 'campaign_id'
-    feature_names: 'customer'
-    feature_names: 'brand'
-    feature_names: 'price'
-    feature_names: 'pid'
-    wide_deep: DEEP
-    sequence_features: {
-    group_name: "seq_fea"
-    tf_summary: false
-    allow_key_search:true
-    seq_att_map: {
-       key: "brand"
-       key: "cate_id"
-       hist_seq: "tag_brand_list"
-       hist_seq: "tag_category_list"
-    }
-  }
-  }
-  fm {
-  }
-  embedding_regularization: 1e-5
-}
-
-export_config {
-}

From 5813c0e0e8c7154d8a9f53745c6adc0cd4b4df50 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Thu, 22 Jun 2023 19:40:17 +0800
Subject: [PATCH 44/54] modify

---
 easy_rec/python/layers/utils.py         | 18 +++--
 easy_rec/python/test/train_eval_test.py |  1 +
 easy_rec/python/utils/config_util.py    | 91 ++++++++++++++++---------
 examples/readme.md                      |  2 +-
 4 files changed, 73 insertions(+), 39 deletions(-)

diff --git a/easy_rec/python/layers/utils.py b/easy_rec/python/layers/utils.py
index 1ba585e07..2af9b855f 100644
--- a/easy_rec/python/layers/utils.py
+++ b/easy_rec/python/layers/utils.py
@@ -19,6 +19,7 @@
 
 import json
 
+from google.protobuf import struct_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import variables
@@ -185,18 +186,25 @@ def l2_regularizer(self, value):
 
   def __getattr__(self, key):
     if self.is_struct:
-      return self.params[key]
+      value = self.params[key]
+      if type(value) == struct_pb2.Struct:
+        return Parameter(value, True, self._l2_reg)
+      else:
+        return value
     return getattr(self.params, key)
 
   def __getitem__(self, key):
-    if self.is_struct:
-      return self.params[key]
-    return getattr(self.params, key)
+    return self.__getattr__(key)
 
   def get_or_default(self, key, def_val):
     if self.is_struct:
       if key in self.params:
-        return self.params[key]
+        if def_val is None:
+          return self.params[key]
+        value = self.params[key]
+        if type(value) == float:
+          return type(def_val)(value)
+        return value
       return def_val
     else:  # pb message
       return getattr(self.params, key)
diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py
index 8f0f25aa1..5680cadb3 100644
--- a/easy_rec/python/test/train_eval_test.py
+++ b/easy_rec/python/test/train_eval_test.py
@@ -960,6 +960,7 @@ def test_distribute_eval_deepfm_multi_cls(self):
 
   def test_distribute_eval_deepfm_single_cls(self):
     cur_eval_path = 'data/test/distribute_eval_test/dwd_distribute_eval_avazu_out_test_combo'
+    #cur_eval_path = '/Users/weisu.yxd/Code/EasyRec/experiments/distribute_eval_test/dwd_distribute_eval_avazu_out_test_combo'
     self._success = test_utils.test_distributed_eval(
         'samples/model_config/deepfm_distribute_eval_combo_on_avazu_ctr.config',
         cur_eval_path, self._test_dir)
diff --git a/easy_rec/python/utils/config_util.py b/easy_rec/python/utils/config_util.py
index 72c050775..9f272919d 100644
--- a/easy_rec/python/utils/config_util.py
+++ b/easy_rec/python/utils/config_util.py
@@ -609,7 +609,7 @@ def process_multi_file_input_path(sampler_config_input_path):
 
 
 def change_configured_embedding_dim(pipeline_config_path, groups, emb_dim):
-  """Reads config from a file containing pipeline_pb2.EasyRecConfig.
+  """Change the embedding dimension of the features in groups.
 
   Args:
     pipeline_config_path: Path to pipeline_pb2.EasyRecConfig text
@@ -622,22 +622,7 @@ def change_configured_embedding_dim(pipeline_config_path, groups, emb_dim):
       `train_input_config`, `eval_config`, `eval_input_config`. Value are the
       corresponding config objects.
   """
-  if isinstance(pipeline_config_path, pipeline_pb2.EasyRecConfig):
-    return pipeline_config_path
-
-  assert tf.gfile.Exists(
-      pipeline_config_path
-  ), 'pipeline_config_path [%s] not exists' % pipeline_config_path
-
-  pipeline_config = pipeline_pb2.EasyRecConfig()
-  with tf.gfile.GFile(pipeline_config_path, 'r') as f:
-    config_str = f.read()
-    if pipeline_config_path.endswith('.config'):
-      text_format.Merge(config_str, pipeline_config)
-    elif pipeline_config_path.endswith('.json'):
-      json_format.Parse(config_str, pipeline_config)
-    else:
-      assert False, 'invalid file format(%s), currently support formats: .config(prototxt) .json' % pipeline_config_path
+  pipeline_config = get_configs_from_pipeline_file(pipeline_config_path, False)
 
   target_groups = set(groups.split(','))
   features = set()
@@ -658,13 +643,50 @@ def change_configured_embedding_dim(pipeline_config_path, groups, emb_dim):
 
   return pipeline_config
 
+def remove_redundant_config(pipeline_config_path):
+  """Remove redundant configs from a file containing pipeline_pb2.EasyRecConfig.
+
+  Args:
+    pipeline_config_path: Path to pipeline_pb2.EasyRecConfig text
+      proto.
+
+  Returns:
+    Dictionary of configuration objects. Keys are `model`, `train_config`,
+      `train_input_config`, `eval_config`, `eval_input_config`. Value are the
+      corresponding config objects.
+  """
+  pipeline_config = get_configs_from_pipeline_file(pipeline_config_path, False)
+
+  features = set()
+  conf = pipeline_config.model_config
+  for group in conf.feature_groups:
+    for feature in group.feature_names:
+      features.add(feature)
+
+  feature_configs = get_compatible_feature_configs(pipeline_config)
+  for fea_conf in feature_configs:
+    fea_name = fea_conf.input_names[0]
+    if fea_conf.HasField('feature_name'):
+      fea_name = fea_conf.feature_name
+    if fea_name not in features:
+      logging.info("redundant feature:" + fea_name)
+      fea_conf.Clear()
+  return pipeline_config
+
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
-      '--pipeline_config_path',
+    '--cmd',
+    type=str,
+    choices=['format', 'set_emb_dim', 'rm_redundancy'],
+    required=True,
+    help='Path to pipeline config file.')
+  parser.add_argument(
+      '-c', '--pipeline_config_path',
       type=str,
       default=None,
+      required=True,
       help='Path to pipeline config file.')
   parser.add_argument(
       '--feature_groups',
@@ -677,23 +699,26 @@ def change_configured_embedding_dim(pipeline_config_path, groups, emb_dim):
       default=None,
       help='The embedding dim to be changed to.')
   parser.add_argument(
-      '--save_config_path',
+      '-o', '--save_config_path',
       type=str,
       default=None,
+      required=True,
       help='Path to save changed config.')
 
   args, extra_args = parser.parse_known_args()
-  if args.pipeline_config_path is None:
-    raise ValueError('--pipeline_config_path must be set')
-  if args.save_config_path is None:
-    raise ValueError('--save_config_path must be set')
-  if args.feature_groups is None:
-    raise ValueError('--feature_groups must be set')
-  if args.embedding_dim is None:
-    raise ValueError('--embedding_dim must be set')
-
-  # 传入一个不存在的feature group，可以起到format配置文件的效果
-  config = change_configured_embedding_dim(args.pipeline_config_path,
-                                           args.feature_groups,
-                                           args.embedding_dim)
-  save_message(config, args.save_config_path)
+  if args.cmd == 'format':
+    config = get_configs_from_pipeline_file(args.pipeline_config_path)
+    save_message(config, args.save_config_path)
+  elif args.cmd == 'set_emb_dim':
+    if args.feature_groups is None:
+      raise ValueError('--feature_groups must be set')
+    if args.embedding_dim is None:
+      raise ValueError('--embedding_dim must be set')
+
+    config = change_configured_embedding_dim(args.pipeline_config_path,
+                                             args.feature_groups,
+                                             args.embedding_dim)
+    save_message(config, args.save_config_path)
+  elif args.cmd == 'rm_redundancy':
+    config = remove_redundant_config(args.pipeline_config_path)
+    save_message(config, args.save_config_path)
diff --git a/examples/readme.md b/examples/readme.md
index cbf9be600..f2c337431 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -217,7 +217,7 @@ python -m easy_rec.python.train_eval --pipeline_config_path examples/configs/dee
   | DeepFM              | 1     | 0.8867 |
   | DeepFM(Backbone)    | 1     | 0.8872 |
   | DCN                 | 1     | 0.8576 |
-  | DCN (Backbone)      | 1     | 0.8770 |
+  | DCN_v2              | 1     | 0.8770 |
   | AutoInt             | 1     | 0.8513 |
   | MaskNet             | 1     | 0.8872 |
   | FibiNet             | 1     | 0.8893 |

From 0c85dd21ccbec24d7bc7c07ca7fedeacae6046f2 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Sat, 24 Jun 2023 16:12:08 +0800
Subject: [PATCH 45/54] add gate layer

---
 easy_rec/python/layers/keras/__init__.py |  1 +
 easy_rec/python/layers/keras/blocks.py   | 22 +++++++++++++
 easy_rec/python/layers/keras/mask_net.py | 39 +++++++++++++++++++++++-
 easy_rec/python/layers/utils.py          |  4 ++-
 easy_rec/python/protos/layer.proto       |  1 +
 5 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/easy_rec/python/layers/keras/__init__.py b/easy_rec/python/layers/keras/__init__.py
index 24f62ffb3..39d7c8be8 100644
--- a/easy_rec/python/layers/keras/__init__.py
+++ b/easy_rec/python/layers/keras/__init__.py
@@ -1,4 +1,5 @@
 from .blocks import MLP
+from .blocks import Gate
 from .blocks import Highway
 from .bst import BST
 from .din import DIN
diff --git a/easy_rec/python/layers/keras/blocks.py b/easy_rec/python/layers/keras/blocks.py
index 62063e451..38b47abfa 100644
--- a/easy_rec/python/layers/keras/blocks.py
+++ b/easy_rec/python/layers/keras/blocks.py
@@ -134,3 +134,25 @@ def call(self, inputs, training=None, **kwargs):
         activation=self.activation,
         num_layers=self.num_layers,
         dropout=self.dropout_rate if training else 0.0)
+
+
+class Gate(tf.keras.layers.Layer):
+  """Weighted sum gate."""
+
+  def __init__(self, params, name='gate', **kwargs):
+    super(Gate, self).__init__(name, **kwargs)
+    self.weight_index = params.get_or_default("weight_index", 0)
+
+  def call(self, inputs, **kwargs):
+    assert len(inputs) > 1, 'input of Gate layer must be a list containing at least 2 elements'
+    weights = inputs[self.weight_index]
+    j = 0
+    for i, x in enumerate(inputs):
+      if i == self.weight_index:
+        continue
+      if j == 0:
+        output = weights[:, j] * x
+      else:
+        output += weights[:, j] * x
+      j += 1
+    return output
diff --git a/easy_rec/python/layers/keras/mask_net.py b/easy_rec/python/layers/keras/mask_net.py
index ca939bb7e..fa1503b11 100644
--- a/easy_rec/python/layers/keras/mask_net.py
+++ b/easy_rec/python/layers/keras/mask_net.py
@@ -8,10 +8,24 @@
 
 
 class MaskBlock(tf.keras.layers.Layer):
+  """MaskBlock use in MaskNet.
+
+  Args:
+    projection_dim: project dimension to reduce the computational cost.
+    Default is `None` such that a full (`input_dim` by `aggregation_size`) matrix
+    W is used. If enabled, a low-rank matrix W = U*V will be used, where U
+    is of size `input_dim` by `projection_dim` and V is of size
+    `projection_dim` by `aggregation_size`. `projection_dim` need to be smaller
+    than `aggregation_size`/2 to improve the model efficiency. In practice, we've
+    observed that `projection_dim` = d/4 consistently preserved the
+    accuracy of a full-rank version.
+  """
 
   def __init__(self, params, name='mask_block', reuse=None, **kwargs):
     super(MaskBlock, self).__init__(name, **kwargs)
     self.config = params.get_pb_config()
+    self.l2_reg = params.l2_regularizer
+    self._projection_dim = params.get_or_default('projection_dim', None)
     self.reuse = reuse
 
   def call(self, inputs, **kwargs):
@@ -31,13 +45,33 @@ def call(self, inputs, **kwargs):
 
     # initializer = tf.initializers.variance_scaling()
     initializer = tf.glorot_uniform_initializer()
-    mask = tf.layers.dense(
+
+    if self._projection_dim is None:
+      mask = tf.layers.dense(
         mask_input,
         aggregation_size,
         activation=tf.nn.relu,
         kernel_initializer=initializer,
+        kernel_regularizer=self.l2_reg,
         name='%s/hidden' % self.name,
         reuse=self.reuse)
+    else:
+      u = tf.layers.dense(
+        mask_input,
+        self._projection_dim,
+        kernel_initializer=initializer,
+        kernel_regularizer=self.l2_reg,
+        use_bias=False,
+        name='%s/prj_u' % self.name,
+        reuse=self.reuse)
+      mask = tf.layers.dense(
+        u,
+        aggregation_size,
+        activation=tf.nn.relu,
+        kernel_initializer=initializer,
+        kernel_regularizer=self.l2_reg,
+        name='%s/prj_v' % self.name,
+        reuse=self.reuse)
     mask = tf.layers.dense(
         mask, net.shape[-1], name='%s/mask' % self.name, reuse=self.reuse)
     masked_net = net * mask
@@ -62,6 +96,7 @@ class MaskNet(tf.keras.layers.Layer):
 
   def __init__(self, params, name='mask_net', **kwargs):
     super(MaskNet, self).__init__(name, **kwargs)
+    self.params = params
     self.config = params.get_pb_config()
     if self.config.HasField('mlp'):
       p = Parameter.make_from_pb(self.config.mlp)
@@ -75,6 +110,7 @@ def call(self, inputs, training=None, **kwargs):
       mask_outputs = []
       for i, block_conf in enumerate(self.config.mask_blocks):
         params = Parameter.make_from_pb(block_conf)
+        params.l2_regularizer = self.params.l2_regularizer
         mask_layer = MaskBlock(params, name='%s/block_%d' % (self.name, i))
         mask_outputs.append(mask_layer((inputs, inputs)))
       all_mask_outputs = tf.concat(mask_outputs, axis=1)
@@ -88,6 +124,7 @@ def call(self, inputs, training=None, **kwargs):
       net = inputs
       for i, block_conf in enumerate(self.config.mask_blocks):
         params = Parameter.make_from_pb(block_conf)
+        params.l2_regularizer = self.params.l2_regularizer
         mask_layer = MaskBlock(params, name='%s/block_%d' % (self.name, i))
         net = mask_layer((net, inputs))
 
diff --git a/easy_rec/python/layers/utils.py b/easy_rec/python/layers/utils.py
index 2af9b855f..705b1be90 100644
--- a/easy_rec/python/layers/utils.py
+++ b/easy_rec/python/layers/utils.py
@@ -207,7 +207,9 @@ def get_or_default(self, key, def_val):
         return value
       return def_val
     else:  # pb message
-      return getattr(self.params, key)
+      if self.params.HasField(key):
+        return getattr(self.params, key)
+      return def_val
 
   def check_required(self, keys):
     if not self.is_struct:
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index 9a1e40acb..c7349c2ac 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -52,6 +52,7 @@ message MaskBlock {
     required uint32 output_size = 2;
     optional uint32 aggregation_size = 3;
     optional bool input_layer_norm = 4 [default = true];
+    optional uint32 projection_dim = 5;
 }
 
 message MaskNet {

From 3ed293a0a2662312c5d94cbbbc4564d6c9acfc6a Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Sat, 24 Jun 2023 19:56:02 +0800
Subject: [PATCH 46/54] add gate layer

---
 easy_rec/python/input/input.py         |  8 ++++--
 easy_rec/python/layers/input_layer.py  |  9 +++++--
 easy_rec/python/layers/keras/blocks.py |  4 +--
 easy_rec/python/layers/utils.py        |  4 +--
 easy_rec/python/utils/config_util.py   | 35 ++++++++++++++++++++++----
 pai_jobs/run.py                        |  7 +++++-
 6 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index 5cdaa1dd1..9b8c4b3b0 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -1,9 +1,11 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import logging
+import os
 from abc import abstractmethod
 from collections import OrderedDict
 
+from easy_rec.python.utils import conditional
 import six
 import tensorflow as tf
 from tensorflow.python.framework import ops
@@ -1012,12 +1014,14 @@ def _input_fn(mode=None, params=None, config=None):
         dataset = self._build(mode, params)
         return dataset
       elif mode is None:  # serving_input_receiver_fn for export SavedModel
+        place_on_cpu = os.getenv('place_embedding_on_cpu')
+        place_on_cpu = eval(place_on_cpu) if place_on_cpu else False
         if export_config.multi_placeholder:
-          with ops.device('/CPU:0'):
+          with conditional(place_on_cpu, ops.device('/CPU:0')):
             inputs, features = self.create_multi_placeholders(export_config)
           return tf.estimator.export.ServingInputReceiver(features, inputs)
         else:
-          with ops.device('/CPU:0'):
+          with conditional(place_on_cpu, ops.device('/CPU:0')):
             inputs, features = self.create_placeholders(export_config)
           print('built feature placeholders. features: {}'.format(
               features.keys()))
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index df1a17b25..4c36811fa 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -1,6 +1,7 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import logging
+import os
 from collections import OrderedDict
 
 import tensorflow as tf
@@ -96,7 +97,9 @@ def __call__(self, features, group_name, is_combine=True, is_dict=False):
     feature_name_to_output_tensors = {}
     negative_sampler = self._feature_groups[group_name]._config.negative_sampler
     if is_combine:
-      with conditional(self._is_predicting, ops.device('/CPU:0')):
+      place_on_cpu = os.getenv('place_embedding_on_cpu')
+      place_on_cpu = eval(place_on_cpu) if place_on_cpu else False
+      with conditional(self._is_predicting and place_on_cpu, ops.device('/CPU:0')):
         concat_features, group_features = self.single_call_input_layer(
             features, group_name, feature_name_to_output_tensors)
       if group_name in self._group_name_to_seq_features:
@@ -194,7 +197,9 @@ def single_call_input_layer(self,
     for column in sorted(group_seq_columns, key=lambda x: x.name):
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name):
-        with conditional(self._is_predicting, ops.device('/CPU:0')):
+        place_on_cpu = os.getenv('place_embedding_on_cpu')
+        place_on_cpu = eval(place_on_cpu) if place_on_cpu else False
+        with conditional(self._is_predicting and place_on_cpu, ops.device('/CPU:0')):
           seq_feature, seq_len = column._get_sequence_dense_tensor(builder)
         embedding_reg_lst.append(seq_feature)
 
diff --git a/easy_rec/python/layers/keras/blocks.py b/easy_rec/python/layers/keras/blocks.py
index 38b47abfa..1a6715a8e 100644
--- a/easy_rec/python/layers/keras/blocks.py
+++ b/easy_rec/python/layers/keras/blocks.py
@@ -151,8 +151,8 @@ def call(self, inputs, **kwargs):
       if i == self.weight_index:
         continue
       if j == 0:
-        output = weights[:, j] * x
+        output = weights[:, j, None] * x
       else:
-        output += weights[:, j] * x
+        output += weights[:, j, None] * x
       j += 1
     return output
diff --git a/easy_rec/python/layers/utils.py b/easy_rec/python/layers/utils.py
index 705b1be90..2af9b855f 100644
--- a/easy_rec/python/layers/utils.py
+++ b/easy_rec/python/layers/utils.py
@@ -207,9 +207,7 @@ def get_or_default(self, key, def_val):
         return value
       return def_val
     else:  # pb message
-      if self.params.HasField(key):
-        return getattr(self.params, key)
-      return def_val
+      return getattr(self.params, key)
 
   def check_required(self, keys):
     if not self.is_struct:
diff --git a/easy_rec/python/utils/config_util.py b/easy_rec/python/utils/config_util.py
index 9f272919d..e35175be9 100644
--- a/easy_rec/python/utils/config_util.py
+++ b/easy_rec/python/utils/config_util.py
@@ -643,12 +643,14 @@ def change_configured_embedding_dim(pipeline_config_path, groups, emb_dim):
 
   return pipeline_config
 
-def remove_redundant_config(pipeline_config_path):
+
+def remove_redundant_config(pipeline_config_path, remove_input=False):
   """Remove redundant configs from a file containing pipeline_pb2.EasyRecConfig.
 
   Args:
     pipeline_config_path: Path to pipeline_pb2.EasyRecConfig text
       proto.
+    remove_input: whether to remove input configs
 
   Returns:
     Dictionary of configuration objects. Keys are `model`, `train_config`,
@@ -657,6 +659,7 @@ def remove_redundant_config(pipeline_config_path):
   """
   pipeline_config = get_configs_from_pipeline_file(pipeline_config_path, False)
 
+  inputs = set()
   features = set()
   conf = pipeline_config.model_config
   for group in conf.feature_groups:
@@ -664,13 +667,30 @@ def remove_redundant_config(pipeline_config_path):
       features.add(feature)
 
   feature_configs = get_compatible_feature_configs(pipeline_config)
-  for fea_conf in feature_configs:
+  offset = 0
+  for i in range(len(feature_configs)):
+    fea_conf = feature_configs[i - offset]
     fea_name = fea_conf.input_names[0]
     if fea_conf.HasField('feature_name'):
       fea_name = fea_conf.feature_name
     if fea_name not in features:
       logging.info("redundant feature:" + fea_name)
-      fea_conf.Clear()
+      del feature_configs[i - offset]
+      offset += 1
+    elif remove_input:
+      for input_name in fea_conf.input_names:
+        inputs.add(input_name)
+
+  if remove_input:
+    for label in pipeline_config.data_config.label_fields:
+      inputs.add(label)
+    input_fields = pipeline_config.data_config.input_fields
+    offset = 0
+    for i in range(len(input_fields)):
+      field = input_fields[i - offset]
+      if field.input_name not in inputs:
+        del input_fields[i - offset]
+        offset += 1
   return pipeline_config
 
 
@@ -689,12 +709,17 @@ def remove_redundant_config(pipeline_config_path):
       required=True,
       help='Path to pipeline config file.')
   parser.add_argument(
-      '--feature_groups',
+      '-g', '--feature_groups',
       type=str,
       default=None,
       help='The name of feature group to be changed.')
   parser.add_argument(
-      '--embedding_dim',
+      '--rm_input',
+      type=bool,
+      default=False,
+      help='Whether to remove redundancy input.')
+  parser.add_argument(
+      '-d', '--embedding_dim',
       type=int,
       default=None,
       help='The embedding dim to be changed to.')
diff --git a/pai_jobs/run.py b/pai_jobs/run.py
index 41c61ad31..986731d36 100644
--- a/pai_jobs/run.py
+++ b/pai_jobs/run.py
@@ -166,6 +166,8 @@
 tf.app.flags.DEFINE_string('oss_embedding_version', '', 'oss embedding version')
 
 tf.app.flags.DEFINE_bool('verbose', False, 'print more debug information')
+tf.app.flags.DEFINE_bool('place_embedding_on_cpu', False,
+                         'whether to place embedding variables on cpu')
 
 # for automl hyper parameter tuning
 tf.app.flags.DEFINE_string('model_dir', None, 'model directory')
@@ -434,7 +436,10 @@ def main(argv):
   elif FLAGS.cmd == 'export':
     check_param('export_dir')
     check_param('config')
-
+    if FLAGS.place_embedding_on_cpu:
+      os.environ['place_embedding_on_cpu'] = 'True'
+    else:
+      os.environ['place_embedding_on_cpu'] = 'False'
     redis_params = {}
     if FLAGS.redis_url:
       redis_params['redis_url'] = FLAGS.redis_url

From a9aff757ed9c220c2ea2d83c246629564d0b3064 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Sun, 25 Jun 2023 21:48:39 +0800
Subject: [PATCH 47/54] add gate layer

---
 easy_rec/python/layers/common_layers.py     | 18 ------------------
 easy_rec/python/layers/utils.py             |  8 +++++++-
 easy_rec/python/protos/feature_config.proto |  1 -
 3 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/easy_rec/python/layers/common_layers.py b/easy_rec/python/layers/common_layers.py
index dd39d8259..fae4fe3fc 100644
--- a/easy_rec/python/layers/common_layers.py
+++ b/easy_rec/python/layers/common_layers.py
@@ -144,21 +144,3 @@ def call(self, group, is_training):
     if self._config.output_2d_tensor_and_feature_list:
       return features, feature_list
     return features
-
-
-class Concatenate(object):
-
-  def __init__(self, config):
-    self.config = config
-
-  def __call__(self, inputs, *args, **kwargs):
-    if self.config.HasField('expand_dim_before'):
-      dim = self.config.expand_dim_before
-      output = tf.stack(inputs, axis=dim)
-    else:
-      output = tf.concat(inputs, axis=self.config.axis)
-
-    if self.config.HasField('expand_dim_after'):
-      dim = self.config.expand_dim_after
-      output = tf.expand_dims(output, dim)
-    return output
diff --git a/easy_rec/python/layers/utils.py b/easy_rec/python/layers/utils.py
index 2af9b855f..b95eef2fe 100644
--- a/easy_rec/python/layers/utils.py
+++ b/easy_rec/python/layers/utils.py
@@ -207,7 +207,13 @@ def get_or_default(self, key, def_val):
         return value
       return def_val
     else:  # pb message
-      return getattr(self.params, key)
+      value = getattr(self.params, key)
+      if hasattr(value, '__len__'):
+        if len(value) > 0:
+          return value
+      elif self.params.HasField(key):
+        return value
+      return def_val
 
   def check_required(self, keys):
     if not self.is_struct:
diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto
index ee245b0e9..75d49a15c 100644
--- a/easy_rec/python/protos/feature_config.proto
+++ b/easy_rec/python/protos/feature_config.proto
@@ -3,7 +3,6 @@ package protos;
 
 import "easy_rec/python/protos/hyperparams.proto";
 import "easy_rec/python/protos/dnn.proto";
-import "easy_rec/python/protos/seq_encoder.proto";
 enum WideOrDeep {
     DEEP = 0;
     WIDE = 1;

From 28473476c60cdbd6ea82eb280781f37c4f8e3eec Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Wed, 28 Jun 2023 15:33:08 +0800
Subject: [PATCH 48/54] add block package for reuse sub network

---
 easy_rec/python/layers/backbone.py            | 153 ++++++++----
 easy_rec/python/protos/backbone.proto         |  30 ++-
 easy_rec/python/protos/easy_rec_model.proto   |   1 +
 .../configs/dcn_backbone_on_movielens.config  |   5 +-
 .../configs/deepfm_backbone_on_criteo.config  |  10 +-
 ...pfm_backbone_on_criteo_with_autodis.config |  16 +-
 ...fm_backbone_on_criteo_with_periodic.config |  16 +-
 .../deepfm_backbone_on_movielens.config       |  16 +-
 .../configs/dlrm_backbone_on_criteo.config    |  12 +-
 .../dlrm_on_criteo_with_autodis.config        |  14 +-
 .../dlrm_on_criteo_with_periodic.config       |  14 +-
 .../configs/dlrm_standard_on_criteo.config    |  10 +-
 examples/configs/fibinet_on_movielens.config  |   6 +-
 examples/configs/masknet_on_movielens.config  |   3 +-
 examples/configs/mlp_on_movielens.config      |   2 +-
 .../configs/multi_tower_on_movielens.config   | 223 ++++++++++++++++++
 ...wide_and_deep_backbone_on_movielens.config |   9 +-
 17 files changed, 438 insertions(+), 102 deletions(-)
 create mode 100644 examples/configs/multi_tower_on_movielens.config

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index b673a209a..414c667fb 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -17,32 +17,9 @@
   tf = tf.compat.v1
 
 
-def block_input(config, block_outputs):
-  inputs = []
-  for input_node in config.inputs:
-    input_name = input_node.name
-    if input_name in block_outputs:
-      input_feature = block_outputs[input_name]
-    else:
-      raise KeyError('input name `%s` does not exists' % input_name)
-    if input_node.HasField('input_fn'):
-      fn = eval(input_node.input_fn)
-      input_feature = fn(input_feature)
-    inputs.append(input_feature)
-
-  if config.merge_inputs_into_list:
-    output = inputs
-  else:
-    output = concat_inputs(inputs, config.input_concat_axis, config.name)
-
-  if config.HasField('extra_input_fn'):
-    fn = eval(config.extra_input_fn)
-    output = fn(output)
-  return output
-
-
-class Backbone(object):
-  """Configurable Backbone Network."""
+class Package(object):
+  """A sub DAG of tf ops for reuse."""
+  __packages = {}
 
   def __init__(self, config, features, input_layer, l2_reg=None):
     self._config = config
@@ -54,22 +31,27 @@ def __init__(self, config, features, input_layer, l2_reg=None):
     self.loss_dict = {}
     input_feature_groups = set()
     for block in config.blocks:
+      if len(block.inputs) == 0:
+        raise ValueError('block takes at least one input: %s' % block.name)
       self._dag.add_node(block.name)
       self._name_to_blocks[block.name] = block
       layer = block.WhichOneof('layer')
       if layer == 'input_layer':
-        if len(block.inputs) != 0:
-          raise ValueError('no input allowed for input_layer: ' + block.name)
-        input_name = block.name
-        if not input_layer.has_group(input_name):
+        if len(block.inputs) != 1:
+          raise ValueError('input layer `%s` takes only one input' % block.name)
+        one_input = block.inputs[0]
+        name = one_input.WhichOneof('name')
+        if name != 'feature_group_name':
           raise KeyError(
-              'input_layer\'s name must be one of feature group, invalid: ' +
-              input_name)
+            '`feature_group_name` should be set for input layer: ' +
+            block.name)
+        input_name = one_input.feature_group_name
+        if not input_layer.has_group(input_name):
+          raise KeyError('invalid feature group name: ' + input_name)
         if input_name in input_feature_groups:
-          raise ValueError('input `%s` already exists in other block' %
-                           input_name)
-        else:
-          input_feature_groups.add(input_name)
+          logging.warning('input `%s` already exists in other block' %
+                          input_name)
+        input_feature_groups.add(input_name)
 
     num_groups = len(input_feature_groups)
     num_blocks = len(self._name_to_blocks) - num_groups
@@ -82,10 +64,8 @@ def __init__(self, config, features, input_layer, l2_reg=None):
       if block.name in input_feature_groups:
         raise KeyError('block name can not be one of feature groups:' +
                        block.name)
-      assert len(block.inputs) > 0, 'no input for block: %s' % block.name
-
       for input_node in block.inputs:
-        input_name = input_node.name
+        input_name = getattr(input_node, input_node.WhichOneof('name'))
         if input_name in self._name_to_blocks:
           assert input_name != block.name, 'input name can not equal to block name:' + input_name
           self._dag.add_edge(input_name, block.name)
@@ -94,19 +74,58 @@ def __init__(self, config, features, input_layer, l2_reg=None):
             logging.info('adding an input_layer block: ' + input_name)
             new_block = backbone_pb2.Block()
             new_block.name = input_name
+            input_cfg = backbone_pb2.Input()
+            input_cfg.feature_group_name = input_name
+            new_block.inputs.append(input_cfg)
             new_block.input_layer.CopyFrom(backbone_pb2.InputLayer())
             self._name_to_blocks[input_name] = new_block
             self._dag.add_node(input_name)
             self._dag.add_edge(input_name, block.name)
-            input_feature_groups.add(block.name)
+            input_feature_groups.add(input_name)
           else:
             raise KeyError(
-                'invalid input name `%s`, must be the name of either a feature group or an another block'
-                % input_name)
+              'invalid input name `%s`, must be the name of either a feature group or an another block'
+              % input_name)
     num_groups = len(input_feature_groups)
     assert num_groups > 0, 'there must be at least one input layer'
+    Package.__packages[self._config.name] = self
+
+  def block_input(self, config, block_outputs, training=None):
+    inputs = []
+    for input_node in config.inputs:
+      input_type = input_node.WhichOneof('name')
+      input_name = getattr(input_node, input_type)
+      if input_type == 'package_name':
+        if input_name not in Package.__packages:
+          raise KeyError('package name `%s` does not exists' % input_name)
+        package = Package.__packages[input_name]
+        input_feature = package(training)
+        if len(package.loss_dict) > 0:
+          self.loss_dict.update(package.loss_dict)
+      elif input_name in block_outputs:
+        input_feature = block_outputs[input_name]
+      else:
+        raise KeyError('input name `%s` does not exists' % input_name)
+      if input_node.HasField('input_fn'):
+        fn = eval(input_node.input_fn)
+        input_feature = fn(input_feature)
+      inputs.append(input_feature)
+
+    if config.merge_inputs_into_list:
+      output = inputs
+    else:
+      output = concat_inputs(inputs, config.input_concat_axis, config.name)
+
+    if config.HasField('extra_input_fn'):
+      fn = eval(config.extra_input_fn)
+      output = fn(output)
+    return output
 
   def __call__(self, is_training, **kwargs):
+    with tf.variable_scope(self._config.name, reuse=tf.AUTO_REUSE):
+      return self.call(is_training)
+
+  def call(self, is_training):
     block_outputs = {}
     blocks = self._dag.topological_sort()
     logging.info('backbone topological order: ' + ','.join(blocks))
@@ -115,7 +134,7 @@ def __call__(self, is_training, **kwargs):
       config = self._name_to_blocks[block]
       if config.layers:  # sequential layers
         logging.info('call sequential %d layers' % len(config.layers))
-        output = block_input(config, block_outputs)
+        output = self.block_input(config, block_outputs, is_training)
         for layer in config.layers:
           output = self.call_layer(output, layer, block, is_training)
         block_outputs[block] = output
@@ -123,14 +142,14 @@ def __call__(self, is_training, **kwargs):
       # just one of layer
       layer = config.WhichOneof('layer')
       if layer is None:  # identity layer
-        block_outputs[block] = block_input(config, block_outputs)
+        block_outputs[block] = self.block_input(config, block_outputs, is_training)
       elif layer == 'input_layer':
         conf = config.input_layer
         input_fn = EnhancedInputLayer(conf, self._input_layer, self._features)
         output = input_fn(block, is_training)
         block_outputs[block] = output
       else:
-        inputs = block_input(config, block_outputs)
+        inputs = self.block_input(config, block_outputs, is_training)
         output = self.call_layer(inputs, config, block, is_training)
         block_outputs[block] = output
 
@@ -146,11 +165,11 @@ def __call__(self, is_training, **kwargs):
         raise ValueError('No output `%s` of backbone to be concat' % output)
     output = concat_inputs(outputs, msg='backbone')
 
-    if self._config.HasField('top_mlp'):
-      params = Parameter.make_from_pb(self._config.top_mlp)
-      params.l2_regularizer = self._l2_reg
-      final_mlp = MLP(params, name='backbone_top_mlp')
-      output = final_mlp(output, training=is_training)
+    # if self._config.HasField('top_mlp'):
+    #   params = Parameter.make_from_pb(self._config.top_mlp)
+    #   params.l2_regularizer = self._l2_reg
+    #   final_mlp = MLP(params, name='backbone_top_mlp')
+    #   output = final_mlp(output, training=is_training)
     return output
 
   def call_keras_layer(self, layer_conf, inputs, name, training):
@@ -185,7 +204,10 @@ def call_keras_layer(self, layer_conf, inputs, name, training):
           logging.info('try to call %s layer with params %r' %
                        (layer_conf.class_name, args))
           layer = layer_cls(*args, name=name)
-      return layer(inputs, training=training)
+      try:
+        return layer(inputs, training=training)
+      except TypeError:
+        return layer(inputs)
 
   def call_layer(self, inputs, config, name, training):
     layer_name = config.WhichOneof('layer')
@@ -243,6 +265,35 @@ def call_layer(self, inputs, config, name, training):
     raise NotImplementedError('Unsupported backbone layer:' + layer_name)
 
 
+class Backbone(object):
+  """Configurable Backbone Network."""
+
+  def __init__(self, config, features, input_layer, l2_reg=None):
+    self._config = config
+    self._l2_reg = l2_reg
+    self.loss_dict = {}
+    for pkg in config.packages:
+      Package(pkg, features, input_layer, l2_reg)
+
+    main_pkg = backbone_pb2.BlockPackage()
+    main_pkg.name = 'backbone'
+    main_pkg.blocks.MergeFrom(config.blocks)
+    main_pkg.concat_blocks.extend(config.concat_blocks)
+    self._main_pkg = Package(main_pkg, features, input_layer, l2_reg)
+
+  def __call__(self, is_training, **kwargs):
+    output = self._main_pkg(is_training, **kwargs)
+    if len(self._main_pkg.loss_dict) > 0:
+      self.loss_dict = self._main_pkg.loss_dict
+
+    if self._config.HasField('top_mlp'):
+      params = Parameter.make_from_pb(self._config.top_mlp)
+      params.l2_regularizer = self._l2_reg
+      final_mlp = MLP(params, name='backbone_top_mlp')
+      output = final_mlp(output, training=is_training)
+    return output
+
+
 def concat_inputs(inputs, axis=-1, msg=''):
   if len(inputs) > 1:
     if all(map(lambda x: type(x) == list, inputs)):
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
index 5e2262707..d73799707 100644
--- a/easy_rec/python/protos/backbone.proto
+++ b/easy_rec/python/protos/backbone.proto
@@ -20,8 +20,12 @@ message Lambda {
 }
 
 message Input {
-    required string name = 1;
-    optional string input_fn = 2;
+    optional string input_fn = 1;
+    oneof name {
+        string feature_group_name = 2;
+        string block_name = 3;
+        string package_name = 4;
+    }
 }
 
 message RecurrentLayer {
@@ -56,6 +60,7 @@ message Block {
 
     // sequential layers
     repeated Layer layers = 6;
+
     // only take effect when there are no layers
     oneof layer {
         InputLayer input_layer = 101;
@@ -66,8 +71,23 @@ message Block {
     }
 }
 
+// a package of blocks for reuse; e.g. call in a contrastive learning manner
+message BlockPackage {
+    // package name
+    required string name = 1;
+    // a few blocks generating a DAG
+    repeated Block blocks = 2;
+    // the names of output blocks
+    repeated string concat_blocks = 3;
+}
+
 message BackboneTower {
-    repeated Block blocks = 1;
-    repeated string concat_blocks = 2;
-    optional MLP top_mlp = 3;
+    // a few sub DAGs
+    repeated BlockPackage packages = 1;
+    // a few blocks generating a DAG
+    repeated Block blocks = 2;
+    // the names of output blocks
+    repeated string concat_blocks = 3;
+    // optional top mlp layer
+    optional MLP top_mlp = 4;
 }
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 2bb801847..21ac685d3 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -55,6 +55,7 @@ message KD {
 }
 
 message EasyRecModel {
+    required string model_name = 99;
     required string model_class = 1;
 
     // actually input layers, each layer produce a group of feature
diff --git a/examples/configs/dcn_backbone_on_movielens.config b/examples/configs/dcn_backbone_on_movielens.config
index 9c84794dd..3376db96f 100644
--- a/examples/configs/dcn_backbone_on_movielens.config
+++ b/examples/configs/dcn_backbone_on_movielens.config
@@ -148,6 +148,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'DCN v2'
   model_class: 'RankModel'
   feature_groups: {
     group_name: 'all'
@@ -164,7 +165,7 @@ model_config: {
     blocks {
       name: "deep"
       inputs {
-        name: 'all'
+        feature_group_name: 'all'
       }
       keras_layer {
         class_name: 'MLP'
@@ -176,7 +177,7 @@ model_config: {
     blocks {
       name: "dcn"
       inputs {
-        name: 'all'
+        feature_group_name: 'all'
         input_fn: 'lambda x: [x, x]'
       }
       recurrent {
diff --git a/examples/configs/deepfm_backbone_on_criteo.config b/examples/configs/deepfm_backbone_on_criteo.config
index 9cba3fb82..06c60f966 100644
--- a/examples/configs/deepfm_backbone_on_criteo.config
+++ b/examples/configs/deepfm_backbone_on_criteo.config
@@ -486,6 +486,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'DeepFM'
   model_class: 'RankModel'
   feature_groups: {
     group_name: "deep_features"
@@ -577,7 +578,7 @@ model_config: {
     blocks {
       name: 'wide_logit'
       inputs {
-        name: 'wide_features'
+        feature_group_name: 'wide_features'
       }
       lambda {
         expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
@@ -585,6 +586,9 @@ model_config: {
     }
     blocks {
       name: 'deep_features'
+      inputs {
+        feature_group_name: 'deep_features'
+      }
       input_layer {
         output_2d_tensor_and_feature_list: true
       }
@@ -592,7 +596,7 @@ model_config: {
     blocks {
       name: 'fm'
       inputs {
-        name: 'deep_features'
+        block_name: 'deep_features'
         input_fn: 'lambda x: x[1]'
       }
       keras_layer {
@@ -608,7 +612,7 @@ model_config: {
     blocks {
       name: 'deep'
       inputs {
-        name: 'deep_features'
+        block_name: 'deep_features'
         input_fn: 'lambda x: x[0]'
       }
       keras_layer {
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
index 49fcf8e38..9d1856cae 100644
--- a/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
+++ b/examples/configs/deepfm_backbone_on_criteo_with_autodis.config
@@ -577,6 +577,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'DeepFM with AutoDis'
   model_class: 'RankModel'
   feature_groups: {
     group_name: "numerical_features"
@@ -672,7 +673,7 @@ model_config: {
     blocks {
       name: 'wide_logit'
       inputs {
-        name: 'wide_features'
+        feature_group_name: 'wide_features'
       }
       lambda {
         expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
@@ -681,7 +682,7 @@ model_config: {
     blocks {
       name: 'num_emb'
       inputs {
-        name: 'numerical_features'
+        feature_group_name: 'numerical_features'
       }
       keras_layer {
         class_name: 'AutoDisEmbedding'
@@ -695,6 +696,9 @@ model_config: {
     }
     blocks {
       name: 'categorical_features'
+      inputs {
+        feature_group_name: 'categorical_features'
+      }
       input_layer {
         output_2d_tensor_and_feature_list: true
       }
@@ -702,11 +706,11 @@ model_config: {
     blocks {
       name: 'fm'
       inputs {
-        name: 'categorical_features'
+        block_name: 'categorical_features'
         input_fn: 'lambda x: x[1]'
       }
       inputs {
-        name: 'num_emb'
+        block_name: 'num_emb'
         input_fn: 'lambda x: x[1]'
       }
       keras_layer {
@@ -719,11 +723,11 @@ model_config: {
     blocks {
       name: 'deep'
       inputs {
-        name: 'categorical_features'
+        block_name: 'categorical_features'
         input_fn: 'lambda x: x[0]'
       }
       inputs {
-        name: 'num_emb'
+        block_name: 'num_emb'
         input_fn: 'lambda x: x[0]'
       }
       keras_layer {
diff --git a/examples/configs/deepfm_backbone_on_criteo_with_periodic.config b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
index 2f2f8435b..3ce65c8bf 100644
--- a/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
+++ b/examples/configs/deepfm_backbone_on_criteo_with_periodic.config
@@ -577,6 +577,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'DeepFM with Periodic'
   model_class: 'RankModel'
   feature_groups: {
     group_name: "numerical_features"
@@ -672,7 +673,7 @@ model_config: {
     blocks {
       name: 'wide_logit'
       inputs {
-        name: 'wide_features'
+        feature_group_name: 'wide_features'
       }
       lambda {
         expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
@@ -681,7 +682,7 @@ model_config: {
     blocks {
       name: 'num_emb'
       inputs {
-        name: 'numerical_features'
+        feature_group_name: 'numerical_features'
       }
       keras_layer {
         class_name: 'PeriodicEmbedding'
@@ -694,6 +695,9 @@ model_config: {
     }
     blocks {
       name: 'categorical_features'
+      inputs {
+        feature_group_name: 'categorical_features'
+      }
       input_layer {
         output_2d_tensor_and_feature_list: true
       }
@@ -701,11 +705,11 @@ model_config: {
     blocks {
       name: 'fm'
       inputs {
-        name: 'categorical_features'
+        block_name: 'categorical_features'
         input_fn: 'lambda x: x[1]'
       }
       inputs {
-        name: 'num_emb'
+        block_name: 'num_emb'
         input_fn: 'lambda x: x[1]'
       }
       keras_layer {
@@ -718,11 +722,11 @@ model_config: {
     blocks {
       name: 'deep'
       inputs {
-        name: 'categorical_features'
+        block_name: 'categorical_features'
         input_fn: 'lambda x: x[0]'
       }
       inputs {
-        name: 'num_emb'
+        block_name: 'num_emb'
         input_fn: 'lambda x: x[0]'
       }
       keras_layer {
diff --git a/examples/configs/deepfm_backbone_on_movielens.config b/examples/configs/deepfm_backbone_on_movielens.config
index c6bf82151..36ef7ace3 100644
--- a/examples/configs/deepfm_backbone_on_movielens.config
+++ b/examples/configs/deepfm_backbone_on_movielens.config
@@ -148,6 +148,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'DeepFM'
   model_class: 'RankModel'
   feature_groups: {
     group_name: 'wide'
@@ -176,7 +177,7 @@ model_config: {
     blocks {
       name: 'wide_logit'
       inputs {
-        name: 'wide'
+        feature_group_name: 'wide'
       }
       lambda {
         expression: 'lambda x: tf.reduce_sum(x, axis=1, keepdims=True)'
@@ -184,6 +185,9 @@ model_config: {
     }
     blocks {
       name: 'features'
+      inputs {
+        feature_group_name: 'features'
+      }
       input_layer {
         output_2d_tensor_and_feature_list: true
       }
@@ -191,7 +195,7 @@ model_config: {
     blocks {
       name: 'fm'
       inputs {
-        name: 'features'
+        block_name: 'features'
         input_fn: 'lambda x: x[1]'
       }
       keras_layer {
@@ -201,7 +205,7 @@ model_config: {
     blocks {
       name: 'deep'
       inputs {
-        name: 'features'
+        block_name: 'features'
         input_fn: 'lambda x: x[0]'
       }
       keras_layer {
@@ -216,13 +220,13 @@ model_config: {
     blocks {
       name: 'add'
       inputs {
-        name: 'wide_logit'
+        block_name: 'wide_logit'
       }
       inputs {
-        name: 'fm'
+        block_name: 'fm'
       }
       inputs {
-        name: 'deep'
+        block_name: 'deep'
       }
       merge_inputs_into_list: true
       keras_layer {
diff --git a/examples/configs/dlrm_backbone_on_criteo.config b/examples/configs/dlrm_backbone_on_criteo.config
index afdc0f784..6dc5dd41e 100644
--- a/examples/configs/dlrm_backbone_on_criteo.config
+++ b/examples/configs/dlrm_backbone_on_criteo.config
@@ -474,6 +474,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'DLRM'
   model_class: 'RankModel'
   feature_groups: {
     group_name: "dense"
@@ -526,7 +527,7 @@ model_config: {
     blocks {
       name: 'bottom_mlp'
       inputs {
-        name: 'dense'
+        feature_group_name: 'dense'
       }
       keras_layer {
         class_name: 'MLP'
@@ -537,6 +538,9 @@ model_config: {
     }
     blocks {
       name: 'sparse'
+      inputs {
+        feature_group_name: 'sparse'
+      }
       input_layer {
         output_2d_tensor_and_feature_list: true
       }
@@ -544,11 +548,11 @@ model_config: {
     blocks {
       name: 'dot'
       inputs {
-        name: 'bottom_mlp'
+        block_name: 'bottom_mlp'
         input_fn: 'lambda x: [x]'
       }
       inputs {
-        name: 'sparse'
+        block_name: 'sparse'
         input_fn: 'lambda x: x[1]'
       }
       keras_layer {
@@ -558,7 +562,7 @@ model_config: {
     blocks {
       name: 'sparse_2d'
       inputs {
-        name: 'sparse'
+        block_name: 'sparse'
         input_fn: 'lambda x: x[0]'
       }
     }
diff --git a/examples/configs/dlrm_on_criteo_with_autodis.config b/examples/configs/dlrm_on_criteo_with_autodis.config
index 151bb4424..c6f522f95 100644
--- a/examples/configs/dlrm_on_criteo_with_autodis.config
+++ b/examples/configs/dlrm_on_criteo_with_autodis.config
@@ -473,6 +473,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'DLRM with autodis'
   model_class: 'RankModel'
   feature_groups: {
     group_name: "dense"
@@ -525,7 +526,7 @@ model_config: {
     blocks {
       name: 'num_emb'
       inputs {
-        name: 'dense'
+        feature_group_name: 'dense'
       }
       keras_layer {
         class_name: 'AutoDisEmbedding'
@@ -539,6 +540,9 @@ model_config: {
     }
     blocks {
       name: 'sparse'
+      inputs {
+        feature_group_name: 'sparse'
+      }
       input_layer {
         output_2d_tensor_and_feature_list: true
       }
@@ -546,11 +550,11 @@ model_config: {
     blocks {
       name: 'dot'
       inputs {
-        name: 'num_emb'
+        block_name: 'num_emb'
         input_fn: 'lambda x: x[1]'
       }
       inputs {
-        name: 'sparse'
+        block_name: 'sparse'
         input_fn: 'lambda x: x[1]'
       }
       keras_layer {
@@ -560,14 +564,14 @@ model_config: {
     blocks {
       name: 'sparse_2d'
       inputs {
-        name: 'sparse'
+        block_name: 'sparse'
         input_fn: 'lambda x: x[0]'
       }
     }
     blocks {
       name: 'num_emb_2d'
       inputs {
-        name: 'num_emb'
+        block_name: 'num_emb'
         input_fn: 'lambda x: x[0]'
       }
     }
diff --git a/examples/configs/dlrm_on_criteo_with_periodic.config b/examples/configs/dlrm_on_criteo_with_periodic.config
index 81d0db1b3..c42e8252b 100644
--- a/examples/configs/dlrm_on_criteo_with_periodic.config
+++ b/examples/configs/dlrm_on_criteo_with_periodic.config
@@ -473,6 +473,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'dlrm with periodic'
   model_class: 'RankModel'
   feature_groups: {
     group_name: "dense"
@@ -525,7 +526,7 @@ model_config: {
     blocks {
       name: 'num_emb'
       inputs {
-        name: 'dense'
+        feature_group_name: 'dense'
       }
       keras_layer {
         class_name: 'PeriodicEmbedding'
@@ -547,6 +548,9 @@ model_config: {
     }
     blocks {
       name: 'sparse'
+      inputs {
+        feature_group_name: 'sparse'
+      }
       input_layer {
         output_2d_tensor_and_feature_list: true
       }
@@ -554,11 +558,11 @@ model_config: {
     blocks {
       name: 'dot'
       inputs {
-        name: 'num_emb'
+        block_name: 'num_emb'
         input_fn: 'lambda x: x[1]'
       }
       inputs {
-        name: 'sparse'
+        block_name: 'sparse'
         input_fn: 'lambda x: x[1]'
       }
       keras_layer {
@@ -568,14 +572,14 @@ model_config: {
     blocks {
       name: 'sparse_2d'
       inputs {
-        name: 'sparse'
+        block_name: 'sparse'
         input_fn: 'lambda x: x[0]'
       }
     }
     blocks {
       name: 'num_emb_2d'
       inputs {
-        name: 'num_emb'
+        block_name: 'num_emb'
         input_fn: 'lambda x: x[0]'
       }
     }
diff --git a/examples/configs/dlrm_standard_on_criteo.config b/examples/configs/dlrm_standard_on_criteo.config
index 03e3df7bc..df82e7990 100644
--- a/examples/configs/dlrm_standard_on_criteo.config
+++ b/examples/configs/dlrm_standard_on_criteo.config
@@ -473,6 +473,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'Stardard DLRM'
   model_class: 'RankModel'
   feature_groups: {
     group_name: "dense"
@@ -525,7 +526,7 @@ model_config: {
     blocks {
       name: 'bottom_mlp'
       inputs {
-        name: 'dense'
+        feature_group_name: 'dense'
       }
       keras_layer {
         class_name: 'MLP'
@@ -536,6 +537,9 @@ model_config: {
     }
     blocks {
       name: 'sparse'
+      inputs {
+        feature_group_name: 'sparse'
+      }
       input_layer {
         only_output_feature_list: true
       }
@@ -543,11 +547,11 @@ model_config: {
     blocks {
       name: 'dot'
       inputs {
-        name: 'bottom_mlp'
+        block_name: 'bottom_mlp'
         input_fn: 'lambda x: [x]'
       }
       inputs {
-        name: 'sparse'
+        block_name: 'sparse'
       }
       keras_layer {
         class_name: 'DotInteraction'
diff --git a/examples/configs/fibinet_on_movielens.config b/examples/configs/fibinet_on_movielens.config
index aa6bef7f0..1fe36aac3 100644
--- a/examples/configs/fibinet_on_movielens.config
+++ b/examples/configs/fibinet_on_movielens.config
@@ -148,6 +148,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'FiBiNet'
   model_class: 'RankModel'
   feature_groups: {
     group_name: 'all'
@@ -163,6 +164,9 @@ model_config: {
   backbone {
     blocks {
       name: "all"
+      inputs {
+        feature_group_name: "all"
+      }
       input_layer {
         do_batch_norm: true
         only_output_feature_list: true
@@ -171,7 +175,7 @@ model_config: {
     blocks {
       name: "fibinet"
       inputs {
-        name: "all"
+        block_name: "all"
       }
       keras_layer {
         class_name: 'FiBiNet'
diff --git a/examples/configs/masknet_on_movielens.config b/examples/configs/masknet_on_movielens.config
index dccbbb13e..fd3dc1342 100644
--- a/examples/configs/masknet_on_movielens.config
+++ b/examples/configs/masknet_on_movielens.config
@@ -148,6 +148,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: 'MaskNet'
   model_class: 'RankModel'
   feature_groups: {
     group_name: 'all'
@@ -164,7 +165,7 @@ model_config: {
     blocks {
       name: "mask_net"
       inputs {
-        name: "all"
+        feature_group_name: "all"
       }
       keras_layer {
         class_name: 'MaskNet'
diff --git a/examples/configs/mlp_on_movielens.config b/examples/configs/mlp_on_movielens.config
index 392f392ef..038b02a51 100644
--- a/examples/configs/mlp_on_movielens.config
+++ b/examples/configs/mlp_on_movielens.config
@@ -164,7 +164,7 @@ model_config: {
     blocks {
       name: 'mlp'
       inputs {
-        name: 'features'
+        feature_group_name: 'features'
       }
       layers {
         keras_layer {
diff --git a/examples/configs/multi_tower_on_movielens.config b/examples/configs/multi_tower_on_movielens.config
new file mode 100644
index 000000000..472443cfa
--- /dev/null
+++ b/examples/configs/multi_tower_on_movielens.config
@@ -0,0 +1,223 @@
+train_input_path: "examples/data/movielens_1m/movies_train_data"
+eval_input_path: "examples/data/movielens_1m/movies_test_data"
+model_dir: "examples/ckpt/multi_tower_movieslen"
+
+train_config {
+  log_step_count_steps: 100
+  optimizer_config: {
+    adam_optimizer: {
+      learning_rate: {
+        exponential_decay_learning_rate {
+          initial_learning_rate: 0.001
+          decay_steps: 1000
+          decay_factor: 0.5
+          min_learning_rate: 0.00001
+        }
+      }
+    }
+    use_moving_average: false
+  }
+  save_checkpoints_steps: 2000
+  sync_replicas: True
+}
+
+eval_config {
+  metrics_set: {
+    auc {}
+  }
+  metrics_set: {
+    gauc {
+      uid_field: 'user_id'
+    }
+  }
+  metrics_set: {
+    max_f1 {}
+  }
+}
+
+data_config {
+  input_fields {
+    input_name:'label'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'user_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'movie_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name:'rating'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'gender'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'age'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'job_id'
+    input_type: INT32
+  }
+  input_fields {
+    input_name: 'zip_id'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'title'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'genres'
+    input_type: STRING
+  }
+  input_fields {
+    input_name: 'year'
+    input_type: INT32
+  }
+
+  label_fields: 'label'
+  batch_size: 1024
+  num_epochs: 1
+  prefetch_size: 32
+  input_type: CSVInput
+  separator: '\t'
+}
+
+feature_config: {
+  features: {
+    input_names: 'user_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 12000
+  }
+  features: {
+    input_names: 'movie_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    hash_bucket_size: 6000
+  }
+  features: {
+    input_names: 'gender'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 2
+  }
+  features: {
+    input_names: 'job_id'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 21
+  }
+  features: {
+    input_names: 'age'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 7
+  }
+  features: {
+    input_names: 'genres'
+    feature_type: TagFeature
+    separator: '|'
+    embedding_dim: 16
+    hash_bucket_size: 100
+  }
+  features: {
+    input_names: 'title'
+    feature_type: SequenceFeature
+    separator: ' '
+    embedding_dim: 16
+    hash_bucket_size: 10000
+    sequence_combiner: {
+      text_cnn: {
+        filter_sizes: [2, 3, 4]
+        num_filters: [16, 8, 8]
+      }
+    }
+  }
+  features: {
+    input_names: 'year'
+    feature_type: IdFeature
+    embedding_dim: 16
+    num_buckets: 36
+  }
+}
+model_config: {
+  model_class: "RankModel"
+  feature_groups: {
+    group_name: 'user'
+    feature_names: 'user_id'
+    feature_names: 'job_id'
+    feature_names: 'age'
+    feature_names: 'gender'
+    wide_deep: DEEP
+  }
+  feature_groups: {
+    group_name: 'item'
+    feature_names: 'movie_id'
+    feature_names: 'year'
+    feature_names: 'genres'
+    wide_deep: DEEP
+  }
+  backbone {
+    packages {
+      name: 'user'
+      blocks {
+        name: 'mlp'
+        inputs {
+          feature_group_name: 'user'
+        }
+        keras_layer {
+          class_name: 'MLP'
+          mlp {
+            hidden_units: [256, 128]
+          }
+        }
+      }
+      concat_blocks: 'mlp'
+    }
+    packages {
+      name: 'item'
+      blocks {
+        name: 'mlp'
+        inputs {
+          feature_group_name: 'item'
+        }
+        keras_layer {
+          class_name: 'MLP'
+          mlp {
+            hidden_units: [256, 128]
+          }
+        }
+      }
+      concat_blocks: 'mlp'
+    }
+    blocks {
+      name: 'top_mlp'
+      inputs {
+        package_name: 'user'
+      }
+      inputs {
+        package_name: 'item'
+      }
+      layers {
+        keras_layer {
+          class_name: 'MLP'
+          mlp {
+            hidden_units: [128, 64]
+          }
+        }
+      }
+    }
+    concat_blocks: 'top_mlp'
+  }
+  rank_model {
+    l2_regularization: 1e-4
+  }
+  embedding_regularization: 1e-4
+}
diff --git a/examples/configs/wide_and_deep_backbone_on_movielens.config b/examples/configs/wide_and_deep_backbone_on_movielens.config
index dddc91888..0f13a0511 100644
--- a/examples/configs/wide_and_deep_backbone_on_movielens.config
+++ b/examples/configs/wide_and_deep_backbone_on_movielens.config
@@ -174,6 +174,9 @@ model_config: {
   backbone {
     blocks {
       name: 'wide'
+      inputs {
+        feature_group_name: 'wide'
+      }
       input_layer {
         only_output_feature_list: true
       }
@@ -181,7 +184,7 @@ model_config: {
     blocks {
       name: 'deep_logit'
       inputs {
-        name: 'deep'
+        feature_group_name: 'deep'
       }
       keras_layer {
         class_name: 'MLP'
@@ -195,11 +198,11 @@ model_config: {
     blocks {
       name: 'final_logit'
       inputs {
-        name: 'wide'
+        block_name: 'wide'
         input_fn: 'lambda x: tf.add_n(x)'
       }
       inputs {
-        name: 'deep_logit'
+        block_name: 'deep_logit'
       }
       merge_inputs_into_list: true
       keras_layer {

From ee49dbe9446d4a715982f636400c44c1a5271345 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Wed, 28 Jun 2023 15:47:54 +0800
Subject: [PATCH 49/54] add block package for reuse sub network

---
 examples/configs/multi_tower_on_movielens.config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/configs/multi_tower_on_movielens.config b/examples/configs/multi_tower_on_movielens.config
index 472443cfa..a502922ae 100644
--- a/examples/configs/multi_tower_on_movielens.config
+++ b/examples/configs/multi_tower_on_movielens.config
@@ -148,6 +148,7 @@ feature_config: {
   }
 }
 model_config: {
+  model_name: "multi tower"
   model_class: "RankModel"
   feature_groups: {
     group_name: 'user'

From 2c591fc523fabd048e1245180f7554ae8b1add98 Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 30 Jun 2023 10:51:38 +0800
Subject: [PATCH 50/54] fix a bug

---
 easy_rec/python/layers/backbone.py    | 68 ++++++++++++++++-----------
 easy_rec/python/protos/backbone.proto | 10 ++--
 2 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 414c667fb..92705e3d4 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -43,8 +43,8 @@ def __init__(self, config, features, input_layer, l2_reg=None):
         name = one_input.WhichOneof('name')
         if name != 'feature_group_name':
           raise KeyError(
-            '`feature_group_name` should be set for input layer: ' +
-            block.name)
+              '`feature_group_name` should be set for input layer: ' +
+              block.name)
         input_name = one_input.feature_group_name
         if not input_layer.has_group(input_name):
           raise KeyError('invalid feature group name: ' + input_name)
@@ -57,6 +57,7 @@ def __init__(self, config, features, input_layer, l2_reg=None):
     num_blocks = len(self._name_to_blocks) - num_groups
     assert num_blocks > 0, 'there must be at least one block in backbone'
 
+    num_pkg_input = 0
     for block in config.blocks:
       layer = block.WhichOneof('layer')
       if layer == 'input_layer':
@@ -65,7 +66,11 @@ def __init__(self, config, features, input_layer, l2_reg=None):
         raise KeyError('block name can not be one of feature groups:' +
                        block.name)
       for input_node in block.inputs:
-        input_name = getattr(input_node, input_node.WhichOneof('name'))
+        input_type = input_node.WhichOneof('name')
+        if input_type == 'package_name':
+          num_pkg_input += 1
+          continue
+        input_name = getattr(input_node, input_type)
         if input_name in self._name_to_blocks:
           assert input_name != block.name, 'input name can not equal to block name:' + input_name
           self._dag.add_edge(input_name, block.name)
@@ -84,10 +89,16 @@ def __init__(self, config, features, input_layer, l2_reg=None):
             input_feature_groups.add(input_name)
           else:
             raise KeyError(
-              'invalid input name `%s`, must be the name of either a feature group or an another block'
-              % input_name)
+                'invalid input name `%s`, must be the name of either a feature group or an another block'
+                % input_name)
     num_groups = len(input_feature_groups)
-    assert num_groups > 0, 'there must be at least one input layer'
+    assert num_pkg_input > 0 or num_groups > 0, 'there must be at least one input layer/feature group'
+
+    if len(config.concat_blocks) == 0:
+      leaf = self._dag.all_leaves()
+      logging.warning("%s has no `concat_blocks`, try to use all leaf blocks: %s" % (config.name, ','.join(leaf)))
+      self._config.concat_blocks.extend(leaf)
+
     Package.__packages[self._config.name] = self
 
   def block_input(self, config, block_outputs, training=None):
@@ -106,6 +117,10 @@ def block_input(self, config, block_outputs, training=None):
         input_feature = block_outputs[input_name]
       else:
         raise KeyError('input name `%s` does not exists' % input_name)
+
+      if input_node.HasField('input_slice'):
+        fn = 'lambda x: x' + input_node.input_slice.strip()
+        input_feature = fn(input_feature)
       if input_node.HasField('input_fn'):
         fn = eval(input_node.input_fn)
         input_feature = fn(input_feature)
@@ -114,7 +129,7 @@ def block_input(self, config, block_outputs, training=None):
     if config.merge_inputs_into_list:
       output = inputs
     else:
-      output = concat_inputs(inputs, config.input_concat_axis, config.name)
+      output = merge_inputs(inputs, config.input_concat_axis, config.name)
 
     if config.HasField('extra_input_fn'):
       fn = eval(config.extra_input_fn)
@@ -142,7 +157,8 @@ def call(self, is_training):
       # just one of layer
       layer = config.WhichOneof('layer')
       if layer is None:  # identity layer
-        block_outputs[block] = self.block_input(config, block_outputs, is_training)
+        block_outputs[block] = self.block_input(config, block_outputs,
+                                                is_training)
       elif layer == 'input_layer':
         conf = config.input_layer
         input_fn = EnhancedInputLayer(conf, self._input_layer, self._features)
@@ -163,13 +179,7 @@ def call(self, is_training):
           outputs.append(temp)
       else:
         raise ValueError('No output `%s` of backbone to be concat' % output)
-    output = concat_inputs(outputs, msg='backbone')
-
-    # if self._config.HasField('top_mlp'):
-    #   params = Parameter.make_from_pb(self._config.top_mlp)
-    #   params.l2_regularizer = self._l2_reg
-    #   final_mlp = MLP(params, name='backbone_top_mlp')
-    #   output = final_mlp(output, training=is_training)
+    output = merge_inputs(outputs, msg='backbone')
     return output
 
   def call_keras_layer(self, layer_conf, inputs, name, training):
@@ -294,20 +304,24 @@ def __call__(self, is_training, **kwargs):
     return output
 
 
-def concat_inputs(inputs, axis=-1, msg=''):
-  if len(inputs) > 1:
-    if all(map(lambda x: type(x) == list, inputs)):
-      # merge multiple lists into a list
-      from functools import reduce
-      return reduce(lambda x, y: x + y, inputs)
-
-    if axis != -1:
-      logging.info('concat inputs %s axis=%d' % (msg, axis))
-    return tf.concat(inputs, axis=axis)
-
+def merge_inputs(inputs, axis=-1, msg=''):
+  if len(inputs) == 0:
+    raise ValueError('no inputs to be concat:' + msg)
   if len(inputs) == 1:
     return inputs[0]
-  raise ValueError('no inputs to be concat:' + msg)
+
+  from functools import reduce
+  if all(map(lambda x: type(x) == list, inputs)):
+    # merge multiple lists into a list
+    return reduce(lambda x, y: x + y, inputs)
+
+  if any(map(lambda x: type(x) == list, inputs)):
+    logging.warning('%s: try to merge inputs into list' % msg)
+    return reduce(lambda x, y: x + y, [e if type(e) == list else [e] for e in inputs])
+
+  if axis != -1:
+    logging.info('concat inputs %s axis=%d' % (msg, axis))
+  return tf.concat(inputs, axis=axis)
 
 
 def format_value(value):
diff --git a/easy_rec/python/protos/backbone.proto b/easy_rec/python/protos/backbone.proto
index d73799707..67b230c04 100644
--- a/easy_rec/python/protos/backbone.proto
+++ b/easy_rec/python/protos/backbone.proto
@@ -20,12 +20,13 @@ message Lambda {
 }
 
 message Input {
-    optional string input_fn = 1;
     oneof name {
-        string feature_group_name = 2;
-        string block_name = 3;
-        string package_name = 4;
+        string feature_group_name = 1;
+        string block_name = 2;
+        string package_name = 3;
     }
+    optional string input_fn = 11;
+    optional string input_slice = 12;
 }
 
 message RecurrentLayer {
@@ -47,6 +48,7 @@ message Layer {
         KerasLayer keras_layer = 2;
         RecurrentLayer recurrent = 3;
         RepeatLayer repeat = 4;
+        InputLayer input_layer = 5;
     }
 }
 

From a1f0b8af0527a0cc7fc6df626ec9eb1e1a699d8b Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 30 Jun 2023 11:32:54 +0800
Subject: [PATCH 51/54] fix bug of input layer block

---
 easy_rec/python/layers/backbone.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 92705e3d4..b5782fb94 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -143,8 +143,8 @@ def __call__(self, is_training, **kwargs):
   def call(self, is_training):
     block_outputs = {}
     blocks = self._dag.topological_sort()
-    logging.info('backbone topological order: ' + ','.join(blocks))
-    print('backbone topological order: ' + ','.join(blocks))
+    logging.info(self._config.name + ' topological order: ' + ','.join(blocks))
+    print(self._config.name + ' topological order: ' + ','.join(blocks))
     for block in blocks:
       config = self._name_to_blocks[block]
       if config.layers:  # sequential layers
@@ -162,7 +162,7 @@ def call(self, is_training):
       elif layer == 'input_layer':
         conf = config.input_layer
         input_fn = EnhancedInputLayer(conf, self._input_layer, self._features)
-        output = input_fn(block, is_training)
+        output = input_fn(config.feature_group_name, is_training)
         block_outputs[block] = output
       else:
         inputs = self.block_input(config, block_outputs, is_training)

From ef5f9cd1fb1fd937be4adbabfdb5c26f7a28e62e Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Fri, 30 Jun 2023 11:37:06 +0800
Subject: [PATCH 52/54] fix bug of input layer block

---
 easy_rec/python/layers/backbone.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index b5782fb94..0b16d92a1 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -162,7 +162,8 @@ def call(self, is_training):
       elif layer == 'input_layer':
         conf = config.input_layer
         input_fn = EnhancedInputLayer(conf, self._input_layer, self._features)
-        output = input_fn(config.feature_group_name, is_training)
+        feature_group = config.inputs[0].feature_group_name
+        output = input_fn(feature_group, is_training)
         block_outputs[block] = output
       else:
         inputs = self.block_input(config, block_outputs, is_training)

From a3944f9b452d675db4bbdfa2121b5ba1df7168ca Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Wed, 12 Jul 2023 10:32:24 +0800
Subject: [PATCH 53/54] fix bug of input layer block

---
 easy_rec/python/layers/backbone.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/easy_rec/python/layers/backbone.py b/easy_rec/python/layers/backbone.py
index 0b16d92a1..3093d9f8e 100644
--- a/easy_rec/python/layers/backbone.py
+++ b/easy_rec/python/layers/backbone.py
@@ -96,7 +96,9 @@ def __init__(self, config, features, input_layer, l2_reg=None):
 
     if len(config.concat_blocks) == 0:
       leaf = self._dag.all_leaves()
-      logging.warning("%s has no `concat_blocks`, try to use all leaf blocks: %s" % (config.name, ','.join(leaf)))
+      logging.warning(
+          '%s has no `concat_blocks`, try to use all leaf blocks: %s' %
+          (config.name, ','.join(leaf)))
       self._config.concat_blocks.extend(leaf)
 
     Package.__packages[self._config.name] = self
@@ -119,7 +121,7 @@ def block_input(self, config, block_outputs, training=None):
         raise KeyError('input name `%s` does not exists' % input_name)
 
       if input_node.HasField('input_slice'):
-        fn = 'lambda x: x' + input_node.input_slice.strip()
+        fn = eval('lambda x: x' + input_node.input_slice.strip())
         input_feature = fn(input_feature)
       if input_node.HasField('input_fn'):
         fn = eval(input_node.input_fn)
@@ -318,7 +320,8 @@ def merge_inputs(inputs, axis=-1, msg=''):
 
   if any(map(lambda x: type(x) == list, inputs)):
     logging.warning('%s: try to merge inputs into list' % msg)
-    return reduce(lambda x, y: x + y, [e if type(e) == list else [e] for e in inputs])
+    return reduce(lambda x, y: x + y,
+                  [e if type(e) == list else [e] for e in inputs])
 
   if axis != -1:
     logging.info('concat inputs %s axis=%d' % (msg, axis))

From faf8ddfce0c8730287ae9ca571dad39fa955e49e Mon Sep 17 00:00:00 2001
From: weisu <weisu.yxd@alibaba-inc.com>
Date: Wed, 12 Jul 2023 11:27:25 +0800
Subject: [PATCH 54/54] upgrade to new version

---
 easy_rec/python/model/dbmtl.py                | 29 --------
 easy_rec/python/model/easy_rec_model.py       | 49 --------------
 easy_rec/python/model/multi_task_model.py     | 66 +++++++++++++++++++
 easy_rec/python/protos/dbmtl.proto            |  5 --
 easy_rec/python/protos/easy_rec_model.proto   | 12 ++--
 easy_rec/python/protos/feature_config.proto   |  1 -
 easy_rec/python/protos/keras_layer.proto      |  1 +
 easy_rec/python/protos/layer.proto            |  9 +++
 easy_rec/python/protos/seq_encoder.proto      | 15 -----
 .../python/protos/variational_dropout.proto   |  2 +-
 10 files changed, 84 insertions(+), 105 deletions(-)

diff --git a/easy_rec/python/model/dbmtl.py b/easy_rec/python/model/dbmtl.py
index a1ebbf14b..e87ee9ae7 100644
--- a/easy_rec/python/model/dbmtl.py
+++ b/easy_rec/python/model/dbmtl.py
@@ -43,35 +43,6 @@ def __init__(self,
     self._init_towers(self._model_config.task_towers)
 
   def build_predict_graph(self):
-    # if self._model_config.use_self_supervised_learning:
-    #   bern = tf.distributions.Bernoulli(probs=0.5)
-    #   num_features = len(self._feature_list)
-    #   mask = bern.sample(num_features)
-    #   left_features, right_features = [], []
-    #   for i in range(num_features):
-    #     fea = self._feature_list[i]
-    #     zero = tf.zeros_like(fea)
-    #     left, right = tf.cond(
-    #         tf.equal(mask[i], 1), lambda: (fea, zero), lambda: (zero, fea))
-    #     left_features.append(left)
-    #     right_features.append(right)
-    #   left_feature = tf.concat(left_features, axis=-1)
-    #   right_feature = tf.concat(right_features, axis=-1)
-    #   if self._model_config.HasField('bottom_mask_net'):
-    #     left_encoding = self._mask_net_layer(
-    #         left_feature, self._is_training, l2_reg=self._l2_reg)
-    #     right_encoding = self._mask_net_layer(
-    #         right_feature, self._is_training, l2_reg=self._l2_reg)
-    #   else:
-    #     raise ValueError(
-    #         'Unsupported bottom layer when use self supervised learning')
-    #
-    #   loss = info_nce_loss(
-    #       left_encoding,
-    #       right_encoding,
-    #       temperature=self._model_config.ssl_loss_temperature)
-    #   self._loss_dict['ssl_loss'] = loss * self._model_config.ssl_loss_weight
-
     bottom_fea = self.backbone
     if bottom_fea is None:
       if self._model_config.HasField('bottom_cmbf'):
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index e385ac9a2..522d3632e 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -62,10 +62,6 @@ def __init__(self,
     if constant.SAMPLE_WEIGHT in features:
       self._sample_weight = features[constant.SAMPLE_WEIGHT]
 
-    # self._sequence_encoder = SequenceEncoder(self._input_layer, feature_configs,
-    #                                          model_config.feature_groups,
-    #                                          self._l2_reg)
-    # self._sequence_encoding_by_group_name = {}
     self._backbone_output = None
     if model_config.HasField('backbone'):
       self._backbone = Backbone(
@@ -138,51 +134,6 @@ def build_input_layer(self, model_config, feature_configs):
         is_training=self._is_training,
         is_predicting=self._is_predicting)
 
-  # def get_sequence_encoding(self, group_name=None, is_training=True):
-  #   if group_name is not None:
-  #     if group_name in self._sequence_encoding_by_group_name:
-  #       return self._sequence_encoding_by_group_name[group_name]
-  #     encoding = self._sequence_encoder(
-  #         self._feature_dict,
-  #         group_name,
-  #         is_training,
-  #         loss_dict=self._loss_dict)
-  #     self._sequence_encoding_by_group_name[group_name] = encoding
-  #     return encoding
-  #
-  #   seq_encoding = []
-  #   for group in self.feature_groups:
-  #     if len(group.sequence_encoders) == 0:
-  #       continue
-  #     group_name = group.group_name
-  #     if group_name in self._sequence_encoding_by_group_name:
-  #       encoding = self._sequence_encoding_by_group_name[group_name]
-  #     else:
-  #       encoding = self._sequence_encoder(
-  #           self._feature_dict,
-  #           group_name,
-  #           is_training,
-  #           loss_dict=self._loss_dict)
-  #       self._sequence_encoding_by_group_name[group_name] = encoding
-  #     if encoding is not None:
-  #       seq_encoding.append(encoding)
-  #
-  #   if len(seq_encoding) > 1:
-  #     encoding = tf.concat(seq_encoding, axis=-1)
-  #   elif len(seq_encoding) == 1:
-  #     encoding = seq_encoding[0]
-  #   else:
-  #     return None
-  #
-  #   # if self._base_model_config.HasField('sequence_dnn'):
-  #   #   sequence_dnn = dnn.DNN(
-  #   #       self._base_model_config.sequence_dnn,
-  #   #       self._l2_reg,
-  #   #       name='sequence_dnn',
-  #   #       is_training=self._is_training)
-  #   #   encoding = sequence_dnn(encoding)
-  #   return encoding
-
   @abstractmethod
   def build_predict_graph(self):
     pass
diff --git a/easy_rec/python/model/multi_task_model.py b/easy_rec/python/model/multi_task_model.py
index 06dc53f8a..21e8f2c55 100644
--- a/easy_rec/python/model/multi_task_model.py
+++ b/easy_rec/python/model/multi_task_model.py
@@ -5,6 +5,7 @@
 import tensorflow as tf
 
 from easy_rec.python.builders import loss_builder
+from easy_rec.python.layers.dnn import DNN
 from easy_rec.python.model.rank_model import RankModel
 from easy_rec.python.protos import tower_pb2
 from easy_rec.python.protos.loss_pb2 import LossType
@@ -27,6 +28,71 @@ def __init__(self,
     self._task_num = None
     self._label_name_dict = {}
 
+  def build_predict_graph(self):
+    if not self.has_backbone:
+      raise NotImplementedError(
+          'method `build_predict_graph` must be implemented when backbone network do not exits'
+      )
+    model = self._model_config.WhichOneof('model')
+    assert model == 'model_params', '`model_params` must be configured'
+    config = self._model_config.model_params
+
+    self._init_towers(config.task_towers)
+
+    backbone = self.backbone
+    if type(backbone) in (list, tuple):
+      if len(backbone) != len(config.task_towers):
+        raise ValueError(
+            'The number of backbone outputs and task towers must be equal')
+      task_input_list = backbone
+    else:
+      task_input_list = [backbone] * len(config.task_towers)
+
+    tower_features = {}
+    for i, task_tower_cfg in enumerate(config.task_towers):
+      tower_name = task_tower_cfg.tower_name
+      if task_tower_cfg.HasField('dnn'):
+        tower_dnn = DNN(
+            task_tower_cfg.dnn,
+            self._l2_reg,
+            name=tower_name,
+            is_training=self._is_training)
+        tower_output = tower_dnn(task_input_list[i])
+      else:
+        tower_output = task_input_list[i]
+      tower_features[tower_name] = tower_output
+
+    tower_outputs = {}
+    relation_features = {}
+    # bayes network
+    for task_tower_cfg in config.task_towers:
+      tower_name = task_tower_cfg.tower_name
+      if task_tower_cfg.HasField('relation_dnn'):
+        relation_dnn = DNN(
+            task_tower_cfg.relation_dnn,
+            self._l2_reg,
+            name=tower_name + '/relation_dnn',
+            is_training=self._is_training)
+        tower_inputs = [tower_features[tower_name]]
+        for relation_tower_name in task_tower_cfg.relation_tower_names:
+          tower_inputs.append(relation_features[relation_tower_name])
+        relation_input = tf.concat(
+            tower_inputs, axis=-1, name=tower_name + '/relation_input')
+        relation_fea = relation_dnn(relation_input)
+        relation_features[tower_name] = relation_fea
+      else:
+        relation_fea = tower_features[tower_name]
+
+      output_logits = tf.layers.dense(
+          relation_fea,
+          task_tower_cfg.num_class,
+          kernel_regularizer=self._l2_reg,
+          name=tower_name + '/output')
+      tower_outputs[tower_name] = output_logits
+
+    self._add_to_prediction_dict(tower_outputs)
+    return self._prediction_dict
+
   def _init_towers(self, task_tower_configs):
     """Init task towers."""
     self._task_towers = task_tower_configs
diff --git a/easy_rec/python/protos/dbmtl.proto b/easy_rec/python/protos/dbmtl.proto
index a9c4a2e74..9adff1f62 100644
--- a/easy_rec/python/protos/dbmtl.proto
+++ b/easy_rec/python/protos/dbmtl.proto
@@ -21,9 +21,4 @@ message DBMTL {
     repeated BayesTaskTower task_towers = 4;
     // l2 regularization
     optional float l2_regularization = 5 [default = 1e-4];
-
-    // Whether to use self supervised learning
-    required bool use_self_supervised_learning = 9 [default = false];
-    optional float ssl_loss_weight = 10 [default = 1.0];
-    optional float ssl_loss_temperature = 11 [default = 0.1];
 }
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index 21ac685d3..1e926c368 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -25,16 +25,17 @@ import "easy_rec/python/protos/loss.proto";
 import "easy_rec/python/protos/rocket_launching.proto";
 import "easy_rec/python/protos/variational_dropout.proto";
 import "easy_rec/python/protos/multi_tower_recall.proto";
+import "easy_rec/python/protos/tower.proto";
 
 // for input performance test
 message DummyModel {
 }
 
-// configure backbone network in a free style way
-message RankModel {
+// configure backbone network common parameters
+message ModelParams {
   optional float l2_regularization = 1;
   optional uint32 wide_output_dim = 2;
-  // optional bool add_head_logits_layer = 3 [default=true];
+  repeated BayesTaskTower task_towers = 3;
 }
 
 // for knowledge distillation
@@ -55,15 +56,16 @@ message KD {
 }
 
 message EasyRecModel {
-    required string model_name = 99;
     required string model_class = 1;
+    // just a name for backbone config
+    optional string model_name = 99;
 
     // actually input layers, each layer produce a group of feature
     repeated FeatureGroupConfig feature_groups = 2;
 
     // model parameters
     oneof model {
-        RankModel rank_model = 100;
+        ModelParams model_params = 100;
         DummyModel dummy = 101;
         WideAndDeep wide_and_deep = 102;
         DeepFM deepfm = 103;
diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto
index 75d49a15c..e05e73753 100644
--- a/easy_rec/python/protos/feature_config.proto
+++ b/easy_rec/python/protos/feature_config.proto
@@ -144,7 +144,6 @@ message FeatureGroupConfig {
     optional WideOrDeep wide_deep = 3 [default = DEEP];
     repeated SeqAttGroupConfig sequence_features = 4;
     optional bool negative_sampler = 5 [default = false];
-    // repeated SequenceEncoder sequence_encoders = 6;
 }
 
 message SeqAttMap {
diff --git a/easy_rec/python/protos/keras_layer.proto b/easy_rec/python/protos/keras_layer.proto
index 94a3ba801..2798260d3 100644
--- a/easy_rec/python/protos/keras_layer.proto
+++ b/easy_rec/python/protos/keras_layer.proto
@@ -22,5 +22,6 @@ message KerasLayer {
         MLP mlp = 11;
         DINEncoder din = 12;
         BSTEncoder bst = 13;
+        MMoELayer mmoe = 14;
     }
 }
diff --git a/easy_rec/python/protos/layer.proto b/easy_rec/python/protos/layer.proto
index c7349c2ac..52a1cbf30 100644
--- a/easy_rec/python/protos/layer.proto
+++ b/easy_rec/python/protos/layer.proto
@@ -60,3 +60,12 @@ message MaskNet {
     required bool use_parallel = 2 [default = true];
     optional MLP mlp = 3;
 }
+
+message MMoELayer {
+    // number of tasks
+    required uint32 num_task = 1;
+    // mmoe expert mlp layer definition
+    optional MLP expert_mlp = 2;
+    // number of mmoe experts
+    optional uint32 num_expert = 3;
+}
diff --git a/easy_rec/python/protos/seq_encoder.proto b/easy_rec/python/protos/seq_encoder.proto
index f02490238..2b845a429 100644
--- a/easy_rec/python/protos/seq_encoder.proto
+++ b/easy_rec/python/protos/seq_encoder.proto
@@ -4,15 +4,6 @@ package protos;
 import "easy_rec/python/protos/dnn.proto";
 
 
-message SequenceEncoder {
-    // encoder parameters
-    oneof encoder {
-        BSTEncoder bst = 101;
-        DINEncoder din = 102;
-    }
-    required bool force_share_embeddings = 1 [default = true];
-}
-
 message BSTEncoder {
     // Size of the encoder layers and the pooler layer
     required uint32 hidden_size = 1;
@@ -34,12 +25,6 @@ message BSTEncoder {
     required bool use_position_embeddings = 9 [default = true];
     // The stddev of the truncated_normal_initializer for initializing all weight matrices
     required float initializer_range = 10 [default = 0.02];
-    // need contrastive learning
-    required bool need_contrastive_learning = 11 [default = false];
-    // the weight of contrastive learning loss
-    optional float contrastive_loss_weight = 12 [default = 1.0];
-    // whether need auto learn contrastive loss weight
-    optional bool auto_contrastive_loss_weight = 13 [default = false];
 }
 
 message DINEncoder {
diff --git a/easy_rec/python/protos/variational_dropout.proto b/easy_rec/python/protos/variational_dropout.proto
index e76a0fb3b..a1bb39974 100644
--- a/easy_rec/python/protos/variational_dropout.proto
+++ b/easy_rec/python/protos/variational_dropout.proto
@@ -2,7 +2,7 @@ syntax = "proto2";
 package protos;
 
 
-message VariationalDropoutLayer {
+message  VariationalDropoutLayer{
     // regularization coefficient lambda
     optional float regularization_lambda = 1 [default = 0.01];
     // variational_dropout dimension