diff --git a/Dockerfile b/Dockerfile
deleted file mode 100755
index 0624d87..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,76 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-devel
-
-RUN apt-get update && apt-get -y upgrade && \
-  apt-get install -y \
-  build-essential \
-  ca-certificates \
-  git \
-  libopenblas-dev \
-  libatlas-base-dev \
-  libglib2.0-dev \
-  libopencv-dev \
-  python-dev \
-  python-numpy \
-  python-setuptools \
-  wget \
-  cmake \
-  curl \
-  python-pip \
-  python-dev \
-  unzip \ 
-  sudo \
-  vim \
-  libglib2.0-dev \
-  libtiff5-dev \
-  libjpeg8-dev \
-  zlib1g-dev 
-
-RUN pip install --upgrade numpy scipy matplotlib scikit-learn sympy nltk setuptools requests
-
-COPY nan.patch /root
-
-# Build MxNet for Python
-RUN cd /root && git clone --recursive https://github.com/dmlc/mxnet.git && cp nan.patch /root/mxnet/ && \
-  cd mxnet && git checkout 955f6be6977ca1a27d3e912fd62a08f019dd1f76 && git apply nan.patch && \
-  cp make/config.mk . && \
-    echo "USE_CUDA=1" >> config.mk && \
-    echo "USE_CUDNN=1" >> config.mk && \
-    echo "CUDA_ARCH :=" \
-         "-gencode arch=compute_35,code=sm_35" \
-         "-gencode arch=compute_52,code=sm_52" \
-         "-gencode arch=compute_60,code=sm_60" \
-         "-gencode arch=compute_61,code=sm_61" \
-         "-gencode arch=compute_61,code=compute_61" >> config.mk && \
-    echo "USE_CUDA_PATH=/usr/local/cuda" >> config.mk 
-
-ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/lib
-RUN cd /root/mxnet && make -j$(nproc) && \
-    mv lib/libmxnet.so /usr/local/lib && \
-    ldconfig && \
-    make clean && \
-    cd python && \
-    pip install -e .
-
-# Python3 support
-RUN apt-get -y install python3-pip
-RUN pip3 install numpy
-
-# Jupyter notebook support
-COPY jupyter_notebook_config.py /root/.jupyter/jupyter_notebook_config.py
-EXPOSE 8888
-
-ENV PYTHONPATH /root/mxnet/python
-
-# Build MxNet for Scala
-#RUN apt-get -y install maven openjdk-8-jdk scala
-#RUN cd /root/mxnet && make scalapkg && make scalainstall
-
-# Build MxNet for R - WIP !!!
-#RUN apt-get -y install r-base r-base-dev
-
-RUN pip install unidecode dill tqdm
-
-WORKDIR /root/mxnet
-
-
-
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/seq2seq/README.md b/README.md
similarity index 74%
rename from seq2seq/README.md
rename to README.md
index 618e0bb..bbf30ab 100644
--- a/seq2seq/README.md
+++ b/README.md
@@ -1,11 +1,10 @@
-OpenNMT seq2seq model
-==========================
+LSTM encoder-decoder seq2seq model
+==================================
 
-This project is an implementation of the [OpenNMT sequence-to-sequence model](http://opennmt.net/Models/) in MxNet. The OpenNMT model is based on:
+This project is an implementation of a simple encoder-decoder seq2seq model in MxNet. The model is based on:
 
 - a stacked LSTM encoder
 - a stacked LSTM decoder
-- an attention model
  
 The reference model configuration is:
 
@@ -38,8 +37,12 @@ How to run the scripts?
 Credits
 -------
 
-Thanks to Eric Xie (@piiswrong) for the attention cell implementation.
+Many thanks to Eric Xie (@piiswrong), Sheng Zha (@szha) and Antti-Pekka Hynninen (@ap-hynninen) for valuable input.
 
+License
+-------
+
+This project is licensed under the Apache 2.0 license. See the text of the license [here](https://github.com/mkolod/mxnet_seq2seq/blob/master/LICENSE.txt).
 
 > **Note:**
 
diff --git a/seq2seq/__init__.py b/__init__.py
similarity index 100%
rename from seq2seq/__init__.py
rename to __init__.py
diff --git a/seq2seq/attention_cell.py b/attention_cell.py
similarity index 100%
rename from seq2seq/attention_cell.py
rename to attention_cell.py
diff --git a/seq2seq/get_en_es_nmt_data.sh b/get_en_es_nmt_data.sh
similarity index 100%
rename from seq2seq/get_en_es_nmt_data.sh
rename to get_en_es_nmt_data.sh
diff --git a/get_opennmt_data.sh b/get_opennmt_data.sh
new file mode 100755
index 0000000..69e07aa
--- /dev/null
+++ b/get_opennmt_data.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+DATA_DIR_ROOT="./data"
+DATA_DIR="${DATA_DIR_ROOT}/wmt15-de-en"
+
+mkdir -p ${DATA_DIR}
+
+pushd . > /dev/null
+
+cd ${DATA_DIR_ROOT}
+
+echo -e "\nDownloading dataset"
+
+wget https://s3.amazonaws.com/opennmt-trainingdata/wmt15-de-en.tgz
+
+echo -e "\nDecompressing dataset\n"
+
+tar xvf wmt15-de-en.tgz
+
+echo -e "\nConcatenating corpora"
+
+cd wmt15-de-en
+
+# concatenate corpora - note concatenation has to be in 
+# the same order for both languages
+
+# we will split this into training and validation sets
+cat commoncrawl.de-en.de europarl-v7.de-en.de news-commentary-v10.de-en.de > train.de
+# the test set already officially exists
+mv newstest2013.de valid.de
+
+wget -O test.de https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2014.de 
+
+# do the same thing to English corpora
+
+cat commoncrawl.de-en.en europarl-v7.de-en.en news-commentary-v10.de-en.en > train.en
+mv newstest2013.en valid.en
+
+wget -O test.en https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2014.en
+
+popd > /dev/null
+
+echo -e "\nData download complete\n"
diff --git a/jupyter_notebook_config.py b/jupyter_notebook_config.py
deleted file mode 100755
index 646be37..0000000
--- a/jupyter_notebook_config.py
+++ /dev/null
@@ -1,584 +0,0 @@
-# Configuration file for jupyter-notebook.
-
-#------------------------------------------------------------------------------
-# Application(SingletonConfigurable) configuration
-#------------------------------------------------------------------------------
-
-## This is an application.
-
-## The date format used by logging formatters for %(asctime)s
-#c.Application.log_datefmt = '%Y-%m-%d %H:%M:%S'
-
-## The Logging format template
-#c.Application.log_format = '[%(name)s]%(highlevel)s %(message)s'
-
-## Set the log level by value or name.
-#c.Application.log_level = 30
-
-#------------------------------------------------------------------------------
-# JupyterApp(Application) configuration
-#------------------------------------------------------------------------------
-
-## Base class for Jupyter applications
-
-## Answer yes to any prompts.
-#c.JupyterApp.answer_yes = False
-
-## Full path of a config file.
-#c.JupyterApp.config_file = u''
-
-## Specify a config file to load.
-#c.JupyterApp.config_file_name = u''
-
-## Generate default config file.
-#c.JupyterApp.generate_config = False
-
-#------------------------------------------------------------------------------
-# NotebookApp(JupyterApp) configuration
-#------------------------------------------------------------------------------
-
-## Set the Access-Control-Allow-Credentials: true header
-#c.NotebookApp.allow_credentials = False
-
-## Set the Access-Control-Allow-Origin header
-#  
-#  Use '*' to allow any origin to access your server.
-#  
-#  Takes precedence over allow_origin_pat.
-#c.NotebookApp.allow_origin = ''
-
-## Use a regular expression for the Access-Control-Allow-Origin header
-#  
-#  Requests from an origin matching the expression will get replies with:
-#  
-#      Access-Control-Allow-Origin: origin
-#  
-#  where `origin` is the origin of the request.
-#  
-#  Ignored if allow_origin is set.
-#c.NotebookApp.allow_origin_pat = ''
-
-## DEPRECATED use base_url
-#c.NotebookApp.base_project_url = '/'
-
-## The base URL for the notebook server.
-#  
-#  Leading and trailing slashes can be omitted, and will automatically be added.
-#c.NotebookApp.base_url = '/'
-
-## Specify what command to use to invoke a web browser when opening the notebook.
-#  If not specified, the default browser will be determined by the `webbrowser`
-#  standard library module, which allows setting of the BROWSER environment
-#  variable to override it.
-#c.NotebookApp.browser = u''
-
-## The full path to an SSL/TLS certificate file.
-#c.NotebookApp.certfile = u''
-
-## The full path to a certificate authority certificate for SSL/TLS client
-#  authentication.
-#c.NotebookApp.client_ca = u''
-
-## The config manager class to use
-#c.NotebookApp.config_manager_class = 'notebook.services.config.manager.ConfigManager'
-
-## The notebook manager class to use.
-#c.NotebookApp.contents_manager_class = 'notebook.services.contents.filemanager.FileContentsManager'
-
-## Extra keyword arguments to pass to `set_secure_cookie`. See tornado's
-#  set_secure_cookie docs for details.
-#c.NotebookApp.cookie_options = {}
-
-## The random bytes used to secure cookies. By default this is a new random
-#  number every time you start the Notebook. Set it to a value in a config file
-#  to enable logins to persist across server sessions.
-#  
-#  Note: Cookie secrets should be kept private, do not share config files with
-#  cookie_secret stored in plaintext (you can read the value from a file).
-#c.NotebookApp.cookie_secret = ''
-
-## The file where the cookie secret is stored.
-#c.NotebookApp.cookie_secret_file = u''
-
-## The default URL to redirect to from `/`
-#c.NotebookApp.default_url = '/tree'
-
-## Disable cross-site-request-forgery protection
-#  
-#  Jupyter notebook 4.3.1 introduces protection from cross-site request
-#  forgeries, requiring API requests to either:
-#  
-#  - originate from pages served by this server (validated with XSRF cookie and
-#  token), or - authenticate with a token
-#  
-#  Some anonymous compute resources still desire the ability to run code,
-#  completely without authentication. These services can disable all
-#  authentication and security checks, with the full knowledge of what that
-#  implies.
-#c.NotebookApp.disable_check_xsrf = False
-
-## Whether to enable MathJax for typesetting math/TeX
-#  
-#  MathJax is the javascript library Jupyter uses to render math/LaTeX. It is
-#  very large, so you may want to disable it if you have a slow internet
-#  connection, or for offline use of the notebook.
-#  
-#  When disabled, equations etc. will appear as their untransformed TeX source.
-#c.NotebookApp.enable_mathjax = True
-
-## extra paths to look for Javascript notebook extensions
-#c.NotebookApp.extra_nbextensions_path = []
-
-## Extra paths to search for serving static files.
-#  
-#  This allows adding javascript/css to be available from the notebook server
-#  machine, or overriding individual files in the IPython
-#c.NotebookApp.extra_static_paths = []
-
-## Extra paths to search for serving jinja templates.
-#  
-#  Can be used to override templates from notebook.templates.
-#c.NotebookApp.extra_template_paths = []
-
-## 
-#c.NotebookApp.file_to_run = ''
-
-## Use minified JS file or not, mainly use during dev to avoid JS recompilation
-#c.NotebookApp.ignore_minified_js = False
-
-## (bytes/sec) Maximum rate at which messages can be sent on iopub before they
-#  are limited.
-#c.NotebookApp.iopub_data_rate_limit = 0
-
-## (msg/sec) Maximum rate at which messages can be sent on iopub before they are
-#  limited.
-#c.NotebookApp.iopub_msg_rate_limit = 0
-
-## The IP address the notebook server will listen on.
-c.NotebookApp.ip = '*'
-
-## Supply extra arguments that will be passed to Jinja environment.
-#c.NotebookApp.jinja_environment_options = {}
-
-## Extra variables to supply to jinja templates when rendering.
-#c.NotebookApp.jinja_template_vars = {}
-
-## The kernel manager class to use.
-#c.NotebookApp.kernel_manager_class = 'notebook.services.kernels.kernelmanager.MappingKernelManager'
-
-## The kernel spec manager class to use. Should be a subclass of
-#  `jupyter_client.kernelspec.KernelSpecManager`.
-#  
-#  The Api of KernelSpecManager is provisional and might change without warning
-#  between this version of Jupyter and the next stable one.
-#c.NotebookApp.kernel_spec_manager_class = 'jupyter_client.kernelspec.KernelSpecManager'
-
-## The full path to a private key file for usage with SSL/TLS.
-#c.NotebookApp.keyfile = u''
-
-## The login handler class to use.
-#c.NotebookApp.login_handler_class = 'notebook.auth.login.LoginHandler'
-
-## The logout handler class to use.
-#c.NotebookApp.logout_handler_class = 'notebook.auth.logout.LogoutHandler'
-
-## A custom url for MathJax.js. Should be in the form of a case-sensitive url to
-#  MathJax, for example:  /static/components/MathJax/MathJax.js
-#c.NotebookApp.mathjax_url = ''
-
-## Dict of Python modules to load as notebook server extensions.Entry values can
-#  be used to enable and disable the loading ofthe extensions. The extensions
-#  will be loaded in alphabetical order.
-#c.NotebookApp.nbserver_extensions = {}
-
-## The directory to use for notebooks and kernels.
-#c.NotebookApp.notebook_dir = u''
-
-## Whether to open in a browser after starting. The specific browser used is
-#  platform dependent and determined by the python standard library `webbrowser`
-#  module, unless it is overridden using the --browser (NotebookApp.browser)
-#  configuration option.
-c.NotebookApp.open_browser = False 
-
-## Hashed password to use for web authentication.
-#  
-#  To generate, type in a python/IPython shell:
-#  
-#    from notebook.auth import passwd; passwd()
-#  
-#  The string should be of the form type:salt:hashed-password.
-#c.NotebookApp.password = u''
-
-## The port the notebook server will listen on.
-c.NotebookApp.port = 8888
-
-## The number of additional ports to try if the specified port is not available.
-#c.NotebookApp.port_retries = 50
-
-## DISABLED: use %pylab or %matplotlib in the notebook to enable matplotlib.
-#c.NotebookApp.pylab = 'disabled'
-
-## (sec) Time window used to  check the message and data rate limits.
-#c.NotebookApp.rate_limit_window = 1.0
-
-## Reraise exceptions encountered loading server extensions?
-#c.NotebookApp.reraise_server_extension_failures = False
-
-## DEPRECATED use the nbserver_extensions dict instead
-#c.NotebookApp.server_extensions = []
-
-## The session manager class to use.
-#c.NotebookApp.session_manager_class = 'notebook.services.sessions.sessionmanager.SessionManager'
-
-## Supply SSL options for the tornado HTTPServer. See the tornado docs for
-#  details.
-#c.NotebookApp.ssl_options = {}
-
-## Token used for authenticating first-time connections to the server.
-#  
-#  When no password is enabled, the default is to generate a new, random token.
-#  
-#  Setting to an empty string disables authentication altogether, which is NOT
-#  RECOMMENDED.
-#c.NotebookApp.token = '<generated>'
-
-## Supply overrides for the tornado.web.Application that the Jupyter notebook
-#  uses.
-#c.NotebookApp.tornado_settings = {}
-
-## Whether to trust or not X-Scheme/X-Forwarded-Proto and X-Real-Ip/X-Forwarded-
-#  For headerssent by the upstream reverse proxy. Necessary if the proxy handles
-#  SSL
-#c.NotebookApp.trust_xheaders = False
-
-## DEPRECATED, use tornado_settings
-#c.NotebookApp.webapp_settings = {}
-
-## The base URL for websockets, if it differs from the HTTP server (hint: it
-#  almost certainly doesn't).
-#  
-#  Should be in the form of an HTTP origin: ws[s]://hostname[:port]
-#c.NotebookApp.websocket_url = ''
-
-#------------------------------------------------------------------------------
-# ConnectionFileMixin(LoggingConfigurable) configuration
-#------------------------------------------------------------------------------
-
-## Mixin for configurable classes that work with connection files
-
-## JSON file in which to store connection info [default: kernel-<pid>.json]
-#  
-#  This file will contain the IP, ports, and authentication key needed to connect
-#  clients to this kernel. By default, this file will be created in the security
-#  dir of the current profile, but can be specified by absolute path.
-#c.ConnectionFileMixin.connection_file = ''
-
-## set the control (ROUTER) port [default: random]
-#c.ConnectionFileMixin.control_port = 0
-
-## set the heartbeat port [default: random]
-#c.ConnectionFileMixin.hb_port = 0
-
-## set the iopub (PUB) port [default: random]
-#c.ConnectionFileMixin.iopub_port = 0
-
-## Set the kernel's IP address [default localhost]. If the IP address is
-#  something other than localhost, then Consoles on other machines will be able
-#  to connect to the Kernel, so be careful!
-#c.ConnectionFileMixin.ip = u''
-
-## set the shell (ROUTER) port [default: random]
-#c.ConnectionFileMixin.shell_port = 0
-
-## set the stdin (ROUTER) port [default: random]
-#c.ConnectionFileMixin.stdin_port = 0
-
-## 
-#c.ConnectionFileMixin.transport = 'tcp'
-
-#------------------------------------------------------------------------------
-# KernelManager(ConnectionFileMixin) configuration
-#------------------------------------------------------------------------------
-
-## Manages a single kernel in a subprocess on this host.
-#  
-#  This version starts kernels with Popen.
-
-## Should we autorestart the kernel if it dies.
-#c.KernelManager.autorestart = True
-
-## DEPRECATED: Use kernel_name instead.
-#  
-#  The Popen Command to launch the kernel. Override this if you have a custom
-#  kernel. If kernel_cmd is specified in a configuration file, Jupyter does not
-#  pass any arguments to the kernel, because it cannot make any assumptions about
-#  the arguments that the kernel understands. In particular, this means that the
-#  kernel does not receive the option --debug if it given on the Jupyter command
-#  line.
-#c.KernelManager.kernel_cmd = []
-
-## Time to wait for a kernel to terminate before killing it, in seconds.
-#c.KernelManager.shutdown_wait_time = 5.0
-
-#------------------------------------------------------------------------------
-# Session(Configurable) configuration
-#------------------------------------------------------------------------------
-
-## Object for handling serialization and sending of messages.
-#  
-#  The Session object handles building messages and sending them with ZMQ sockets
-#  or ZMQStream objects.  Objects can communicate with each other over the
-#  network via Session objects, and only need to work with the dict-based IPython
-#  message spec. The Session will handle serialization/deserialization, security,
-#  and metadata.
-#  
-#  Sessions support configurable serialization via packer/unpacker traits, and
-#  signing with HMAC digests via the key/keyfile traits.
-#  
-#  Parameters ----------
-#  
-#  debug : bool
-#      whether to trigger extra debugging statements
-#  packer/unpacker : str : 'json', 'pickle' or import_string
-#      importstrings for methods to serialize message parts.  If just
-#      'json' or 'pickle', predefined JSON and pickle packers will be used.
-#      Otherwise, the entire importstring must be used.
-#  
-#      The functions must accept at least valid JSON input, and output *bytes*.
-#  
-#      For example, to use msgpack:
-#      packer = 'msgpack.packb', unpacker='msgpack.unpackb'
-#  pack/unpack : callables
-#      You can also set the pack/unpack callables for serialization directly.
-#  session : bytes
-#      the ID of this Session object.  The default is to generate a new UUID.
-#  username : unicode
-#      username added to message headers.  The default is to ask the OS.
-#  key : bytes
-#      The key used to initialize an HMAC signature.  If unset, messages
-#      will not be signed or checked.
-#  keyfile : filepath
-#      The file containing a key.  If this is set, `key` will be initialized
-#      to the contents of the file.
-
-## Threshold (in bytes) beyond which an object's buffer should be extracted to
-#  avoid pickling.
-#c.Session.buffer_threshold = 1024
-
-## Whether to check PID to protect against calls after fork.
-#  
-#  This check can be disabled if fork-safety is handled elsewhere.
-#c.Session.check_pid = True
-
-## Threshold (in bytes) beyond which a buffer should be sent without copying.
-#c.Session.copy_threshold = 65536
-
-## Debug output in the Session
-#c.Session.debug = False
-
-## The maximum number of digests to remember.
-#  
-#  The digest history will be culled when it exceeds this value.
-#c.Session.digest_history_size = 65536
-
-## The maximum number of items for a container to be introspected for custom
-#  serialization. Containers larger than this are pickled outright.
-#c.Session.item_threshold = 64
-
-## execution key, for signing messages.
-#c.Session.key = ''
-
-## path to file containing execution key.
-#c.Session.keyfile = ''
-
-## Metadata dictionary, which serves as the default top-level metadata dict for
-#  each message.
-#c.Session.metadata = {}
-
-## The name of the packer for serializing messages. Should be one of 'json',
-#  'pickle', or an import name for a custom callable serializer.
-#c.Session.packer = 'json'
-
-## The UUID identifying this session.
-#c.Session.session = u''
-
-## The digest scheme used to construct the message signatures. Must have the form
-#  'hmac-HASH'.
-#c.Session.signature_scheme = 'hmac-sha256'
-
-## The name of the unpacker for unserializing messages. Only used with custom
-#  functions for `packer`.
-#c.Session.unpacker = 'json'
-
-## Username for the Session. Default is your system username.
-#c.Session.username = u'username'
-
-#------------------------------------------------------------------------------
-# MultiKernelManager(LoggingConfigurable) configuration
-#------------------------------------------------------------------------------
-
-## A class for managing multiple kernels.
-
-## The name of the default kernel to start
-#c.MultiKernelManager.default_kernel_name = 'python2'
-
-## The kernel manager class.  This is configurable to allow subclassing of the
-#  KernelManager for customized behavior.
-#c.MultiKernelManager.kernel_manager_class = 'jupyter_client.ioloop.IOLoopKernelManager'
-
-#------------------------------------------------------------------------------
-# MappingKernelManager(MultiKernelManager) configuration
-#------------------------------------------------------------------------------
-
-## A KernelManager that handles notebook mapping and HTTP error handling
-
-## 
-#c.MappingKernelManager.root_dir = u''
-
-#------------------------------------------------------------------------------
-# ContentsManager(LoggingConfigurable) configuration
-#------------------------------------------------------------------------------
-
-## Base class for serving files and directories.
-#  
-#  This serves any text or binary file, as well as directories, with special
-#  handling for JSON notebook documents.
-#  
-#  Most APIs take a path argument, which is always an API-style unicode path, and
-#  always refers to a directory.
-#  
-#  - unicode, not url-escaped
-#  - '/'-separated
-#  - leading and trailing '/' will be stripped
-#  - if unspecified, path defaults to '',
-#    indicating the root path.
-
-## 
-#c.ContentsManager.checkpoints = None
-
-## 
-#c.ContentsManager.checkpoints_class = 'notebook.services.contents.checkpoints.Checkpoints'
-
-## 
-#c.ContentsManager.checkpoints_kwargs = {}
-
-## Glob patterns to hide in file and directory listings.
-#c.ContentsManager.hide_globs = [u'__pycache__', '*.pyc', '*.pyo', '.DS_Store', '*.so', '*.dylib', '*~']
-
-## Python callable or importstring thereof
-#  
-#  To be called on a contents model prior to save.
-#  
-#  This can be used to process the structure, such as removing notebook outputs
-#  or other side effects that should not be saved.
-#  
-#  It will be called as (all arguments passed by keyword)::
-#  
-#      hook(path=path, model=model, contents_manager=self)
-#  
-#  - model: the model to be saved. Includes file contents.
-#    Modifying this dict will affect the file that is stored.
-#  - path: the API path of the save destination
-#  - contents_manager: this ContentsManager instance
-#c.ContentsManager.pre_save_hook = None
-
-## The base name used when creating untitled directories.
-#c.ContentsManager.untitled_directory = 'Untitled Folder'
-
-## The base name used when creating untitled files.
-#c.ContentsManager.untitled_file = 'untitled'
-
-## The base name used when creating untitled notebooks.
-#c.ContentsManager.untitled_notebook = 'Untitled'
-
-#------------------------------------------------------------------------------
-# FileManagerMixin(Configurable) configuration
-#------------------------------------------------------------------------------
-
-## Mixin for ContentsAPI classes that interact with the filesystem.
-#  
-#  Provides facilities for reading, writing, and copying both notebooks and
-#  generic files.
-#  
-#  Shared by FileContentsManager and FileCheckpoints.
-#  
-#  Note ---- Classes using this mixin must provide the following attributes:
-#  
-#  root_dir : unicode
-#      A directory against against which API-style paths are to be resolved.
-#  
-#  log : logging.Logger
-
-## By default notebooks are saved on disk on a temporary file and then if
-#  succefully written, it replaces the old ones. This procedure, namely
-#  'atomic_writing', causes some bugs on file system whitout operation order
-#  enforcement (like some networked fs). If set to False, the new notebook is
-#  written directly on the old one which could fail (eg: full filesystem or quota
-#  )
-#c.FileManagerMixin.use_atomic_writing = True
-
-#------------------------------------------------------------------------------
-# FileContentsManager(FileManagerMixin,ContentsManager) configuration
-#------------------------------------------------------------------------------
-
-## Python callable or importstring thereof
-#  
-#  to be called on the path of a file just saved.
-#  
-#  This can be used to process the file on disk, such as converting the notebook
-#  to a script or HTML via nbconvert.
-#  
-#  It will be called as (all arguments passed by keyword)::
-#  
-#      hook(os_path=os_path, model=model, contents_manager=instance)
-#  
-#  - path: the filesystem path to the file just written - model: the model
-#  representing the file - contents_manager: this ContentsManager instance
-#c.FileContentsManager.post_save_hook = None
-
-## 
-#c.FileContentsManager.root_dir = u''
-
-## DEPRECATED, use post_save_hook. Will be removed in Notebook 5.0
-#c.FileContentsManager.save_script = False
-
-#------------------------------------------------------------------------------
-# NotebookNotary(LoggingConfigurable) configuration
-#------------------------------------------------------------------------------
-
-## A class for computing and verifying notebook signatures.
-
-## The hashing algorithm used to sign notebooks.
-#c.NotebookNotary.algorithm = 'sha256'
-
-## The sqlite file in which to store notebook signatures. By default, this will
-#  be in your Jupyter data directory. You can set it to ':memory:' to disable
-#  sqlite writing to the filesystem.
-#c.NotebookNotary.db_file = u''
-
-## The secret key with which notebooks are signed.
-#c.NotebookNotary.secret = ''
-
-## The file where the secret key is stored.
-#c.NotebookNotary.secret_file = u''
-
-## A callable returning the storage backend for notebook signatures. The default
-#  uses an SQLite database.
-#c.NotebookNotary.store_factory = traitlets.Undefined
-
-#------------------------------------------------------------------------------
-# KernelSpecManager(LoggingConfigurable) configuration
-#------------------------------------------------------------------------------
-
-## If there is no Python kernelspec registered and the IPython kernel is
-#  available, ensure it is added to the spec list.
-#c.KernelSpecManager.ensure_native_kernel = True
-
-## The kernel spec class.  This is configurable to allow subclassing of the
-#  KernelSpecManager for customized behavior.
-#c.KernelSpecManager.kernel_spec_class = 'jupyter_client.kernelspec.KernelSpec'
-
-## Whitelist of allowed kernel names.
-#  
-#  By default, all installed kernels are allowed.
-#c.KernelSpecManager.whitelist = set([])
diff --git a/model_infer_gpu.sh b/model_infer_gpu.sh
new file mode 100755
index 0000000..3049bcb
--- /dev/null
+++ b/model_infer_gpu.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Runs seq2seq_bucketing.py with --infer, using --model-prefix trained_model and --load-epoch 1.
+python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 2 --gpus 0,1 --batch-size 8 \
+  --disp-batches 1 --num-epochs 1 --model-prefix trained_model --dropout 0.0 \
+  --infer --load-epoch 1 --attention
+# Flag notes kept for reference (commented out; --attention is already passed above):
+# --attention
+# --input-feed --remove-state-feed
+# --use-cudnn-cells
+# --inference-unrolling-for-training
diff --git a/seq2seq/model_train_cpu.sh b/model_train_cpu.sh
similarity index 100%
rename from seq2seq/model_train_cpu.sh
rename to model_train_cpu.sh
diff --git a/model_train_gpu.sh b/model_train_gpu.sh
new file mode 100755
index 0000000..6449087
--- /dev/null
+++ b/model_train_gpu.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 2 --gpus 0,1 --batch-size 256 \
+  --optimizer adagrad --lr 0.01 --disp-batches 10 --num-epochs 10 \
+  --dropout 0.3 --seed 1234 --model-prefix trained_model --attention --input-feed --remove-state-feed
+# --attention
+# --input-feed --remove-state-feed
+#  --model-prefix trained_model
+# --use-cudnn-cells 
+#  --inference-unrolling-for-training
diff --git a/model_train_gpu_alt_unrolling.sh b/model_train_gpu_alt_unrolling.sh
new file mode 100755
index 0000000..9d7f816
--- /dev/null
+++ b/model_train_gpu_alt_unrolling.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 2 --gpus 0 --batch-size 128 \
+  --optimizer adagrad --lr 0.141 --disp-batches 100 --num-epochs 1 --model-prefix trained_model \
+  --dropout 0.3 --seed 1234
+# NOTE(review): the file name suggests alternative unrolling, but the flag below is commented out and not passed - confirm.
+# --inference-unrolling-for-training
+
diff --git a/nan.patch b/nan.patch
deleted file mode 100644
index e858c1e..0000000
--- a/nan.patch
+++ /dev/null
@@ -1,46 +0,0 @@
-diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py
-index 396f5a1..544eab2 100644
---- a/python/mxnet/callback.py
-+++ b/python/mxnet/callback.py
-@@ -96,13 +96,16 @@ class Speedometer(object):
-     frequent: int
-         How many batches between calculations.
-         Defaults to calculating & logging every 50 batches.
-+    auto_reset : bool
-+        Reset the metric after each log.
-     """
--    def __init__(self, batch_size, frequent=50):
-+    def __init__(self, batch_size, frequent=50, auto_reset=False):
-         self.batch_size = batch_size
-         self.frequent = frequent
-         self.init = False
-         self.tic = 0
-         self.last_count = 0
-+        self.auto_reset = auto_reset
- 
-     def __call__(self, param):
-         """Callback to Show speed."""
-@@ -116,7 +119,8 @@ class Speedometer(object):
-                 speed = self.frequent * self.batch_size / (time.time() - self.tic)
-                 if param.eval_metric is not None:
-                     name_value = param.eval_metric.get_name_value()
--                    param.eval_metric.reset()
-+                    if self.auto_reset:
-+                        param.eval_metric.reset()
-                     for name, value in name_value:
-                         logging.info('Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f',
-                                      param.epoch, count, speed, name, value)
-diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py
-index 2870fab..8a8b974 100644
---- a/python/mxnet/metric.py
-+++ b/python/mxnet/metric.py
-@@ -265,7 +265,8 @@ class Perplexity(EvalMetric):
-         self.num_inst += num
- 
-     def get(self):
--        return (self.name, math.exp(self.sum_metric/self.num_inst))
-+        num = self.num_inst if self.num_inst > 0 else float('nan')
-+        return (self.name, math.exp(self.sum_metric/num))
- 
- ####################
- # REGRESSION METRICS
diff --git a/seq2seq/preprocess_data.py b/preprocess_data.py
similarity index 65%
rename from seq2seq/preprocess_data.py
rename to preprocess_data.py
index 274b7f1..7836dea 100644
--- a/seq2seq/preprocess_data.py
+++ b/preprocess_data.py
@@ -28,19 +28,14 @@
     start = time()
 
     dataset = get_s2s_data(
-        src_train_path= './data/wmt15-de-en/train.en',
-        src_valid_path= './data/wmt15-de-en/valid.en',
-        targ_train_path= './data/wmt15-de-en/train.de',
-        targ_valid_path= './data/wmt15-de-en/valid.de' 
+        src_train_path=  './data/wmt15-de-en/train.de',
+        src_valid_path=  './data/wmt15-de-en/valid.de',
+        src_test_path = './data/wmt15-de-en/test.de',
+        targ_train_path= './data/wmt15-de-en/train.en',
+        targ_valid_path= './data/wmt15-de-en/valid.en',
+        targ_test_path = './data/wmt15-de-en/test.en'
     )
 
-#    dataset = get_s2s_data(
-#        src_train_path=  './data/europarl-v7.es-en.en_train_small',
-#        src_valid_path=  './data/europarl-v7.es-en.en_valid_small',
-#        targ_train_path= './data/europarl-v7.es-en.es_train_small',
-#        targ_valid_path= './data/europarl-v7.es-en.en_valid_small'
-#    )
-
    
     preproc_duration = time() - start
     print("\nPreprocessing data took %.4f seconds\n" % preproc_duration)
@@ -59,11 +54,12 @@
 #    all_pairs = [(5, 5), (15,15), (20, 20)]
 #    max_sent_len = 25
 
+    batch_size=64
 
     print("Constructing train iterator")
     start = time()
     train_iter = Seq2SeqIter(dataset.src_train_sent, dataset.targ_train_sent, dataset.src_vocab, dataset.inv_src_vocab,
-                     dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=64, buckets=all_pairs, max_sent_len=max_len)
+                     dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=batch_size, buckets=all_pairs, max_sent_len=max_len)
 
     train_iter.bucketize()
     train_iter_duration = time() - start
@@ -80,7 +76,7 @@
     print("Constructing valid iterator")
     valid_iter_duration = time()
     valid_iter = Seq2SeqIter(dataset.src_valid_sent, dataset.targ_valid_sent, dataset.src_vocab, dataset.inv_src_vocab,
-                     dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=64, buckets=all_pairs, max_sent_len=50)
+                     dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=batch_size, buckets=all_pairs, max_sent_len=max_len)
 
     valid_iter.bucketize()
     valid_iter_duration = time() - start
@@ -89,7 +85,23 @@
     print("Serializing validation set iterator.")
     start = time()
     with open('./data/valid_iterator.pkl', 'wb') as f:
-        pickle.dump(train_iter, f, pickle.HIGHEST_PROTOCOL)
+        pickle.dump(valid_iter, f, pickle.HIGHEST_PROTOCOL)
     valid_ser_duration = time() - start
     print("\nSerializing validation set iterator took %.4f seconds\n" % valid_ser_duration)
 
+    print("Constructing test iterator")
+    start = time()  # restart the timer: `start` otherwise still holds the validation-serialization start time
+    test_iter = Seq2SeqIter(dataset.src_test_sent, dataset.targ_test_sent, dataset.src_vocab, dataset.inv_src_vocab,
+                     dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=batch_size, buckets=all_pairs, max_sent_len=max_len)
+
+    test_iter.bucketize()
+    test_iter_duration = time() - start
+    print("\nBucketizing data for test set iterator took %.4f seconds\n" % test_iter_duration)
+
+    print("Serializing test set iterator.")
+    start = time()
+    with open('./data/test_iterator.pkl', 'wb') as f:
+        pickle.dump(test_iter, f, pickle.HIGHEST_PROTOCOL)
+    test_ser_duration = time() - start
+    print("\nSerializing test set iterator took %.4f seconds\n" % test_ser_duration)
+
diff --git a/seq2seq/get_opennmt_data.sh b/seq2seq/get_opennmt_data.sh
deleted file mode 100755
index abad5d1..0000000
--- a/seq2seq/get_opennmt_data.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/bin/bash
-
-DATA_DIR_ROOT="./data"
-DATA_DIR="${DATA_DIR_ROOT}/wmt15-de-en"
-IN_SRC_DATA_PATH="${DATA_DIR}/all.en"
-IN_TARG_DATA_PATH="${DATA_DIR}/all.de"
-OUT_SRC_TRAIN_PATH="${DATA_DIR}/train.en"
-OUT_TARG_TRAIN_PATH="${DATA_DIR}/train.de"
-OUT_SRC_VALID_PATH="${DATA_DIR}/valid.en"
-OUT_TARG_VALID_PATH="${DATA_DIR}/valid.de"
-VALIDATION_FRACTION=0.2
-SHUFFLE_SEED=42
-
-mkdir -p ${DATA_DIR}
-
-pushd . > /dev/null
-
-cd ${DATA_DIR_ROOT}
-
-echo -e "\nDownloading dataset"
-
-wget https://s3.amazonaws.com/opennmt-trainingdata/wmt15-de-en.tgz
-
-echo -e "\nDecompressing dataset\n"
-
-tar xvf wmt15-de-en.tgz
-
-echo -e "\nConcatenating corpora"
-
-cd wmt15-de-en
-
-# concatenate corpora - note concatenation has to be in 
-# the same order for both languages
-
-# we will split this into training and validation sets
-cat commoncrawl.de-en.de europarl-v7.de-en.de news-commentary-v10.de-en.de > all.de
-# the test set already officially exists
-mv newstest2013.de test.de
-
-# do the same thing to English corpora
-
-cat commoncrawl.de-en.en europarl-v7.de-en.en news-commentary-v10.de-en.en > all.en
-mv newstest2013.en test.en
-
-popd > /dev/null
-
-echo -e "\nShuffling examples and splitting into training and validation samples"
-
-
-# shuffle examples so validation data isn't completely from one
-# corpus while training is from another
-
-python split_train_valid.py \
-  --in-src-data-path ${IN_SRC_DATA_PATH} \
-  --in-targ-data-path ${IN_TARG_DATA_PATH} \
-  --out-src-train-path ${OUT_SRC_TRAIN_PATH} \
-  --out-targ-train-path ${OUT_TARG_TRAIN_PATH} \
-  --out-src-valid-path ${OUT_SRC_VALID_PATH} \
-  --out-targ-valid-path ${OUT_TARG_VALID_PATH} \
-  --validation-fraction 0.2 \
-  --shuffle-seed 42
-
-
diff --git a/seq2seq/model_infer_gpu.sh b/seq2seq/model_infer_gpu.sh
deleted file mode 100755
index 1755ec1..0000000
--- a/seq2seq/model_infer_gpu.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 1 --gpus 0,1 --batch-size 256 --disp-batches 1 --num-epochs 1 --model-prefix trained_model --dropout 0.5 --infer --load-epoch 1 
-# --use-cudnn-cells
diff --git a/seq2seq/model_train_gpu.sh b/seq2seq/model_train_gpu.sh
deleted file mode 100755
index 3f4e987..0000000
--- a/seq2seq/model_train_gpu.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 22 --gpus 0,1 --batch-size 256 --optimizer adagrad --lr 0.1 --disp-batches 1 --num-epochs 1 --model-prefix trained_model --dropout 0.5
-# --use-cudnn-cells 
diff --git a/seq2seq/rnn_cell.py b/seq2seq/rnn_cell.py
deleted file mode 100644
index f93072f..0000000
--- a/seq2seq/rnn_cell.py
+++ /dev/null
@@ -1,951 +0,0 @@
-# coding: utf-8
-# pylint: disable=no-member, invalid-name, protected-access, no-self-use
-# pylint: disable=too-many-branches, too-many-arguments, no-self-use
-# pylint: disable=too-many-lines
-"""Definition of various recurrent neural network cells."""
-from __future__ import print_function
-
-import warnings
-
-import mxnet as mx
-from mxnet import symbol, init, ndarray, _symbol_internal
-from mxnet.base import string_types, numeric_types
-
-
-def _cells_state_shape(cells):
-    return sum([c.state_shape for c in cells], [])
-
-def _cells_begin_state(cells, **kwargs):
-    return sum([c.begin_state(**kwargs) for c in cells], [])
-
-def _cells_unpack_weights(cells, args):
-    for cell in cells:
-        args = cell.unpack_weights(args)
-    return args
-
-def _cells_pack_weights(cells, args):
-    for cell in cells:
-        args = cell.pack_weights(args)
-    return args
-
-def _normalize_sequence(length, inputs, layout, merge, in_layout=None):
-    assert inputs is not None, \
-        "unroll(inputs=None) has been deprecated. " \
-        "Please create input variables outside unroll."
-
-    axis = layout.find('T')
-    in_axis = in_layout.find('T') if in_layout is not None else axis
-    if isinstance(inputs, symbol.Symbol):
-        if merge is False:
-            assert len(inputs.list_outputs()) == 1, \
-                "unroll doesn't allow grouped symbol as input. Please convert " \
-                "to list with list(inputs) first or let unroll handle splitting."
-            inputs = list(symbol.split(inputs, axis=in_axis, num_outputs=length,
-                                       squeeze_axis=1))
-    else:
-        assert length is None or len(inputs) == length
-        if merge is True:
-            inputs = [symbol.expand_dims(i, axis=axis) for i in inputs]
-            inputs = symbol.Concat(*inputs, dim=axis)
-            in_axis = axis
-
-    if isinstance(inputs, symbol.Symbol) and axis != in_axis:
-        inputs = symbol.swapaxes(inputs, dim0=axis, dim1=in_axis)
-
-    return inputs, axis
-
-
-class RNNParams(object):
-    """Container for holding variables.
-    Used by RNN cells for parameter sharing between cells.
-
-    Parameters
-    ----------
-    prefix : str
-        All variables' name created by this container will
-        be prepended with prefix
-    """
-    def __init__(self, prefix=''):
-        self._prefix = prefix
-        self._params = {}
-
-    def get(self, name, **kwargs):
-        """Get a variable with name or create a new one if missing.
-
-        Parameters
-        ----------
-        name : str
-            name of the variable
-        **kwargs :
-            more arguments that's passed to symbol.Variable
-        """
-        name = self._prefix + name
-        if name not in self._params:
-            self._params[name] = symbol.Variable(name, **kwargs)
-        return self._params[name]
-
-
-class BaseRNNCell(object):
-    """Abstract base class for RNN cells
-
-    Parameters
-    ----------
-    prefix : str
-        prefix for name of layers
-        (and name of weight if params is None)
-    params : RNNParams or None
-        container for weight sharing between cells.
-        created if None.
-    """
-    def __init__(self, prefix='', params=None):
-        if params is None:
-            params = RNNParams(prefix)
-            self._own_params = True
-        else:
-            self._own_params = False
-        self._prefix = prefix
-        self._params = params
-        self._modified = False
-
-        self.reset()
-
-    def reset(self):
-        """Reset before re-using the cell for another graph"""
-        self._init_counter = -1
-        self._counter = -1
-
-    def __call__(self, inputs, states):
-        """Construct symbol for one step of RNN.
-
-        Parameters
-        ----------
-        inputs : sym.Variable
-            input symbol, 2D, batch * num_units
-        states : sym.Variable
-            state from previous step or begin_state().
-
-        Returns
-        -------
-        output : Symbol
-            output symbol
-        states : Symbol
-            state to next step of RNN.
-        """
-        raise NotImplementedError()
-
-    @property
-    def params(self):
-        """Parameters of this cell"""
-        self._own_params = False
-        return self._params
-
-    @property
-    def state_shape(self):
-        """shape(s) of states"""
-        raise NotImplementedError()
-
-    @property
-    def _gate_names(self):
-        """name(s) of gates"""
-        return ()
-
-    def begin_state(self, func=symbol.zeros, **kwargs):
-        """Initial state for this cell.
-
-        Parameters
-        ----------
-        func : callable, default symbol.zeros
-            Function for creating initial state. Can be symbol.zeros,
-            symbol.uniform, symbol.Variable etc.
-            Use symbol.Variable if you want to directly
-            feed input as states.
-        **kwargs :
-            more keyword arguments passed to func. For example
-            mean, std, dtype, etc.
-
-        Returns
-        -------
-        states : nested list of Symbol
-            starting states for first RNN step
-        """
-        assert not self._modified, \
-            "After applying modifier cells (e.g. DropoutCell) the base " \
-            "cell cannot be called directly. Call the modifier cell instead."
-        states = []
-        for shape in self.state_shape:
-            self._init_counter += 1
-            if shape is None:
-                state = func(name='%sbegin_state_%d'%(self._prefix, self._init_counter),
-                             **kwargs)
-            else:
-                state = func(name='%sbegin_state_%d'%(self._prefix, self._init_counter),
-                             shape=shape, **kwargs)
-            states.append(state)
-        return states
-
-    def unpack_weights(self, args):
-        """Unpack fused weight matrices into separate
-        weight matrices
-
-        Parameters
-        ----------
-        args : dict of str -> NDArray
-            dictionary containing packed weights.
-            usually from Module.get_output()
-
-        Returns
-        -------
-        args : dict of str -> NDArray
-            dictionary with weights associated to
-            this cell unpacked.
-        """
-        args = args.copy()
-        if not self._gate_names:
-            return args
-        h = self._num_hidden
-        for group_name in ['i2h', 'h2h']:
-            weight = args.pop('%s%s_weight'%(self._prefix, group_name))
-            bias = args.pop('%s%s_bias' % (self._prefix, group_name))
-            for j, gate in enumerate(self._gate_names):
-                wname = '%s%s%s_weight' % (self._prefix, group_name, gate)
-                args[wname] = weight[j*h:(j+1)*h].copy()
-                bname = '%s%s%s_bias' % (self._prefix, group_name, gate)
-                args[bname] = bias[j*h:(j+1)*h].copy()
-        return args
-
-    def pack_weights(self, args):
-        """Pack separate weight matrices into fused
-        weight.
-
-        Parameters
-        ----------
-        args : dict of str -> NDArray
-            dictionary containing unpacked weights.
-
-        Returns
-        -------
-        args : dict of str -> NDArray
-            dictionary with weights associated to
-            this cell packed.
-        """
-        args = args.copy()
-        if not self._gate_names:
-            return args
-        for group_name in ['i2h', 'h2h']:
-            weight = []
-            bias = []
-            for gate in self._gate_names:
-                wname = '%s%s%s_weight'%(self._prefix, group_name, gate)
-                weight.append(args.pop(wname))
-                bname = '%s%s%s_bias'%(self._prefix, group_name, gate)
-                bias.append(args.pop(bname))
-            args['%s%s_weight'%(self._prefix, group_name)] = ndarray.concatenate(weight)
-            args['%s%s_bias'%(self._prefix, group_name)] = ndarray.concatenate(bias)
-        return args
-
-    def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None):
-        """Unroll an RNN cell across time steps.
-
-        Parameters
-        ----------
-        length : int
-            number of steps to unroll
-        inputs : Symbol, list of Symbol, or None
-            if inputs is a single Symbol (usually the output
-            of Embedding symbol), it should have shape
-            (batch_size, length, ...) if layout == 'NTC',
-            or (length, batch_size, ...) if layout == 'TNC'.
-
-            If inputs is a list of symbols (usually output of
-            previous unroll), they should all have shape
-            (batch_size, ...).
-        begin_state : nested list of Symbol
-            input states. Created by begin_state()
-            or output state of another cell. Created
-            from begin_state() if None.
-        layout : str
-            layout of input symbol. Only used if inputs
-            is a single Symbol.
-        merge_outputs : bool
-            If False, return outputs as a list of Symbols.
-            If True, concatenate output across time steps
-            and return a single symbol with shape
-            (batch_size, length, ...) if layout == 'NTC',
-            or (length, batch_size, ...) if layout == 'TNC'.
-            If None, output whatever is faster
-
-        Returns
-        -------
-        outputs : list of Symbol
-            output symbols.
-        states : Symbol or nested list of Symbol
-            has the same structure as begin_state()
-        """
-        self.reset()
-
-        inputs, _ = _normalize_sequence(length, inputs, layout, False)
-        if begin_state is None:
-            begin_state = self.begin_state()
-
-        states = begin_state
-        outputs = []
-        for i in range(length):
-            output, states = self(inputs[i], states)
-            outputs.append(output)
-
-        outputs, _ = _normalize_sequence(length, outputs, layout, merge_outputs)
-
-        return outputs, states
-
-    #pylint: disable=no-self-use
-    def _get_activation(self, inputs, activation, **kwargs):
-        """Get activation function. Convert if is string"""
-        if isinstance(activation, string_types):
-            return symbol.Activation(inputs, act_type=activation, **kwargs)
-        else:
-            return activation(inputs, **kwargs)
-
-
-class RNNCell(BaseRNNCell):
-    """Simple recurrent neural network cell
-
-    Parameters
-    ----------
-    num_hidden : int
-        number of units in output symbol
-    activation : str or Symbol, default 'tanh'
-        type of activation function
-    prefix : str, default 'rnn_'
-        prefix for name of layers
-        (and name of weight if params is None)
-    params : RNNParams or None
-        container for weight sharing between cells.
-        created if None.
-    """
-    def __init__(self, num_hidden, activation='tanh', prefix='rnn_', params=None):
-        super(RNNCell, self).__init__(prefix=prefix, params=params)
-        self._num_hidden = num_hidden
-        self._activation = activation
-        self._iW = self.params.get('i2h_weight')
-        self._iB = self.params.get('i2h_bias')
-        self._hW = self.params.get('h2h_weight')
-        self._hB = self.params.get('h2h_bias')
-
-    @property
-    def state_shape(self):
-        return [(0, self._num_hidden)]
-
-    @property
-    def _gate_names(self):
-        return ('',)
-
-    def __call__(self, inputs, states):
-        self._counter += 1
-        name = '%st%d_'%(self._prefix, self._counter)
-        i2h = symbol.FullyConnected(data=inputs, weight=self._iW, bias=self._iB,
-                                    num_hidden=self._num_hidden,
-                                    name='%si2h'%name)
-        h2h = symbol.FullyConnected(data=states[0], weight=self._hW, bias=self._hB,
-                                    num_hidden=self._num_hidden,
-                                    name='%sh2h'%name)
-        output = self._get_activation(i2h + h2h, self._activation,
-                                      name='%sout'%name)
-
-        return output, [output]
-
-
-class LSTMCell(BaseRNNCell):
-    """Long-Short Term Memory (LSTM) network cell.
-
-    Parameters
-    ----------
-    num_hidden : int
-        number of units in output symbol
-    prefix : str, default 'lstm_'
-        prefix for name of layers
-        (and name of weight if params is None)
-    params : RNNParams or None
-        container for weight sharing between cells.
-        created if None.
-    forget_bias : float, default 1.0
-        bias added to forget gate.
-        Jozefowicz et al. 2015 recommends setting this to 1.0
-    """
-    def __init__(self, num_hidden, prefix='lstm_', params=None, forget_bias=1.0):
-        super(LSTMCell, self).__init__(prefix=prefix, params=params)
-
-        self._num_hidden = num_hidden
-        # Input-to-hidden and hidden-to-hidden weights, looked up in the parameter container.
-        self._iW = self.params.get('i2h_weight')
-        self._hW = self.params.get('h2h_weight')
-        # we add the forget_bias to i2h_bias, this adds the bias to the forget gate activation
-        self._iB = self.params.get('i2h_bias', init=init.LSTMBias(forget_bias=forget_bias))
-        self._hB = self.params.get('h2h_bias')
-
-    @property
-    def state_shape(self):
-        """Shapes of the two recurrent states: hidden state first, cell state second."""
-        return [(0, self._num_hidden), (0, self._num_hidden)]
-
-    @property
-    def _gate_names(self):
-        """Gate suffixes, in the same order the fused gate output is sliced in __call__."""
-        return ['_i', '_f', '_c', '_o']
-
-    def __call__(self, inputs, states):
-        """Build the symbols for one LSTM time step.
-
-        Parameters
-        ----------
-        inputs : Symbol
-            input for this step
-        states : list of Symbol
-            [previous hidden state, previous cell state]
-
-        Returns
-        -------
-        output : Symbol
-            the new hidden state
-        states : list of Symbol
-            [new hidden state, new cell state]
-        """
-        self._counter += 1
-        name = '%st%d_'%(self._prefix, self._counter)
-        # Both projections produce all four gates at once (hence num_hidden*4).
-        i2h = symbol.FullyConnected(data=inputs, weight=self._iW, bias=self._iB,
-                                    num_hidden=self._num_hidden*4,
-                                    name='%si2h'%name)
-        h2h = symbol.FullyConnected(data=states[0], weight=self._hW, bias=self._hB,
-                                    num_hidden=self._num_hidden*4,
-                                    name='%sh2h'%name)
-        gates = i2h + h2h
-        # Split the fused pre-activations into input, forget, candidate and output parts.
-        slice_gates = symbol.SliceChannel(gates, num_outputs=4,
-                                          name="%sslice"%name)
-        in_gate = symbol.Activation(slice_gates[0], act_type="sigmoid",
-                                    name='%si'%name)
-        forget_gate = symbol.Activation(slice_gates[1], act_type="sigmoid",
-                                        name='%sf'%name)
-        in_transform = symbol.Activation(slice_gates[2], act_type="tanh",
-                                         name='%sc'%name)
-        out_gate = symbol.Activation(slice_gates[3], act_type="sigmoid",
-                                     name='%so'%name)
-        # next_c = forget_gate * previous cell state + in_gate * candidate
-        next_c = symbol._internal._plus(forget_gate * states[1], in_gate * in_transform,
-                                        name='%sstate'%name)
-        # next_h = out_gate * tanh(next_c)
-        next_h = symbol._internal._mul(out_gate, symbol.Activation(next_c, act_type="tanh"),
-                                       name='%sout'%name)
-
-        return next_h, [next_h, next_c]
-
-
-class GRUCell(BaseRNNCell):
-    """Gated Recurrent Unit (GRU) network cell.
-    Note: this is an implementation of the cuDNN version of GRUs
-    (slight modification compared to Cho et al. 2014).
-
-    Parameters
-    ----------
-    num_hidden : int
-        number of units in output symbol
-    prefix : str, default 'gru_'
-        prefix for name of layers
-        (and name of weight if params is None)
-    params : RNNParams or None
-        container for weight sharing between cells.
-        created if None.
-    """
-    def __init__(self, num_hidden, prefix='gru_', params=None):
-        super(GRUCell, self).__init__(prefix=prefix, params=params)
-        self._num_hidden = num_hidden
-        # Input-to-hidden and hidden-to-hidden weights and biases from the parameter container.
-        self._iW = self.params.get("i2h_weight")
-        self._iB = self.params.get("i2h_bias")
-        self._hW = self.params.get("h2h_weight")
-        self._hB = self.params.get("h2h_bias")
-
-    @property
-    def state_shape(self):
-        """Shape of the single recurrent state (the hidden state)."""
-        return [(0, self._num_hidden)]
-
-    @property
-    def _gate_names(self):
-        """Gate suffixes, in the order the fused projections are sliced in __call__."""
-        return ['_r', '_z', '_o']
-
-    def __call__(self, inputs, states):
-        """Build the symbols for one GRU time step.
-
-        Parameters
-        ----------
-        inputs : Symbol
-            input for this step
-        states : list of Symbol
-            [previous hidden state]
-
-        Returns
-        -------
-        output : Symbol
-            the new hidden state
-        states : list of Symbol
-            [new hidden state]
-        """
-        # pylint: disable=too-many-locals
-        self._counter += 1
-
-        seq_idx = self._counter
-        # NOTE(review): `name` already ends with '_', so the "%s_i2h"-style names
-        # below contain a double underscore while '%sout' does not.
-        name = '%st%d_' % (self._prefix, seq_idx)
-        prev_state_h = states[0]
-
-        # Both projections produce all three parts at once (hence num_hidden * 3).
-        i2h = symbol.FullyConnected(data=inputs,
-                                    weight=self._iW,
-                                    bias=self._iB,
-                                    num_hidden=self._num_hidden * 3,
-                                    name="%s_i2h" % name)
-        h2h = symbol.FullyConnected(data=prev_state_h,
-                                    weight=self._hW,
-                                    bias=self._hB,
-                                    num_hidden=self._num_hidden * 3,
-                                    name="%s_h2h" % name)
-
-        # After slicing, i2h / h2h are rebound to the candidate-state parts only.
-        i2h_r, i2h_z, i2h = symbol.SliceChannel(i2h, num_outputs=3, name="%s_i2h_slice" % name)
-        h2h_r, h2h_z, h2h = symbol.SliceChannel(h2h, num_outputs=3, name="%s_h2h_slice" % name)
-
-        reset_gate = symbol.Activation(i2h_r + h2h_r, act_type="sigmoid",
-                                       name="%s_r_act" % name)
-        update_gate = symbol.Activation(i2h_z + h2h_z, act_type="sigmoid",
-                                        name="%s_z_act" % name)
-
-        # The reset gate scales the hidden-to-hidden projection (after the matrix multiply).
-        next_h_tmp = symbol.Activation(i2h + reset_gate * h2h, act_type="tanh",
-                                       name="%s_h_act" % name)
-
-        # Blend the candidate with the previous hidden state using the update gate.
-        next_h = symbol._internal._plus((1. - update_gate) * next_h_tmp, update_gate * prev_state_h,
-                                        name='%sout' % name)
-
-        return next_h, [next_h]
-
-
-class FusedRNNCell(BaseRNNCell):
-    """Fusing RNN layers across time step into one kernel.
-    Improves speed but is less flexible. Currently only
-    supported if using cuDNN on GPU.
-
-    Parameters
-    ----------
-    num_hidden : int
-        size of the hidden state, passed to the RNN operator as state_size
-    num_layers : int, default 1
-        number of stacked layers
-    mode : str, default 'lstm'
-        one of 'rnn_relu', 'rnn_tanh', 'lstm' or 'gru'
-    bidirectional : bool, default False
-        whether each layer has both an 'l' and an 'r' direction
-    dropout : float, default 0.
-        passed to the RNN operator as p; also used by unfuse() to
-        insert a DropoutCell between layers
-    get_next_state : bool, default False
-        whether unroll() returns the final states; when False
-        unroll() returns an empty state list
-    forget_bias : float, default 1.0
-        forwarded to the init.FusedRNN initializer
-    prefix : str or None
-        prefix for name of layers; defaults to '<mode>_' when None
-    params : RNNParams or None
-        container for weight sharing between cells.
-        created if None.
-    """
-    def __init__(self, num_hidden, num_layers=1, mode='lstm', bidirectional=False,
-                 dropout=0., get_next_state=False, forget_bias=1.0,
-                 prefix=None, params=None):
-        if prefix is None:
-            prefix = '%s_'%mode
-        super(FusedRNNCell, self).__init__(prefix=prefix, params=params)
-        self._num_hidden = num_hidden
-        self._num_layers = num_layers
-        self._mode = mode
-        self._bidirectional = bidirectional
-        self._dropout = dropout
-        self._get_next_state = get_next_state
-        self._directions = ['l', 'r'] if bidirectional else ['l']
-
-        # All weights and biases live in one flat 'parameters' variable.
-        initializer = init.FusedRNN(None, num_hidden, num_layers, mode,
-                                    bidirectional, forget_bias)
-        self._parameter = self.params.get('parameters', init=initializer)
-
-    @property
-    def state_shape(self):
-        """One state shape per recurrent state (two for 'lstm', otherwise one)."""
-        b = self._bidirectional + 1       # 2 when bidirectional, else 1
-        n = (self._mode == 'lstm') + 1    # 2 states for lstm, else 1
-        return [(b*self._num_layers, 0, self._num_hidden)]*n
-
-    @property
-    def _gate_names(self):
-        """Gate suffixes for the configured mode; raises KeyError for an unknown mode."""
-        return {'rnn_relu': [''],
-                'rnn_tanh': [''],
-                'lstm': ['_i', '_f', '_c', '_o'],
-                'gru': ['_r', '_z', '_o']}[self._mode]
-
-    @property
-    def _num_gates(self):
-        """Number of gates for the configured mode."""
-        return len(self._gate_names)
-
-    def _slice_weights(self, arr, li, lh):
-        """slice fused rnn weights
-
-        Parameters
-        ----------
-        arr : flat array holding every weight and bias
-        li : int
-            input size of the first layer
-        lh : int
-            hidden size
-
-        Returns
-        -------
-        dict mapping '<prefix><direction><layer>_{i2h,h2h}<gate>_{weight,bias}'
-        to the matching slice of arr. pack_weights fills arr by assigning
-        into these slices.
-        """
-        args = {}
-        gate_names = self._gate_names
-        directions = self._directions
-
-        b = len(directions)
-        p = 0  # running offset into arr
-        # First pass: all weight matrices, layer by layer and direction by direction.
-        for layer in range(self._num_layers):
-            for direction in directions:
-                for gate in gate_names:
-                    name = '%s%s%d_i2h%s_weight'%(self._prefix, direction, layer, gate)
-                    if layer > 0:
-                        # Upper layers take the previous layer's output of width b*lh.
-                        size = b*lh*lh
-                        args[name] = arr[p:p+size].reshape((lh, b*lh))
-                    else:
-                        size = li*lh
-                        args[name] = arr[p:p+size].reshape((lh, li))
-                    p += size
-                for gate in gate_names:
-                    name = '%s%s%d_h2h%s_weight'%(self._prefix, direction, layer, gate)
-                    size = lh**2
-                    args[name] = arr[p:p+size].reshape((lh, lh))
-                    p += size
-
-        # Second pass: all bias vectors follow after every weight matrix.
-        for layer in range(self._num_layers):
-            for direction in directions:
-                for gate in gate_names:
-                    name = '%s%s%d_i2h%s_bias'%(self._prefix, direction, layer, gate)
-                    args[name] = arr[p:p+lh]
-                    p += lh
-                for gate in gate_names:
-                    name = '%s%s%d_h2h%s_bias'%(self._prefix, direction, layer, gate)
-                    args[name] = arr[p:p+lh]
-                    p += lh
-
-        # Every element of arr must have been consumed exactly once.
-        assert p == arr.size, "Invalid parameters size for FusedRNNCell"
-        return args
-
-    def unpack_weights(self, args):
-        """Replace the fused parameter array in args with per-gate copies.
-
-        Works on a shallow copy of args; the fused entry is removed from the
-        returned dict and the sliced weights/biases are added as copies.
-        """
-        args = args.copy()
-        arr = args.pop(self._parameter.name)
-        b = len(self._directions)
-        m = self._num_gates
-        h = self._num_hidden
-        # Inverse of the `total` size formula in pack_weights, solved for num_input.
-        num_input = arr.size//b//h//m - (self._num_layers - 1)*(h+b*h+2) - h - 2
-
-        nargs = self._slice_weights(arr, num_input, self._num_hidden)
-        args.update({name: nd.copy() for name, nd in nargs.items()})
-        return args
-
-    def pack_weights(self, args):
-        """Replace the per-gate weights/biases in args with one fused parameter array.
-
-        Works on a shallow copy of args; the per-gate entries are popped and the
-        fused array is stored under the fused parameter's name.
-        """
-        args = args.copy()
-        b = self._bidirectional + 1
-        m = self._num_gates
-        c = self._gate_names
-        h = self._num_hidden
-        # The first layer's input weight gives the input size, context and dtype.
-        w0 = args['%sl0_i2h%s_weight'%(self._prefix, c[0])]
-        num_input = w0.shape[1]
-        # First layer: (num_input + h + 2) * h per gate and direction;
-        # each further layer: (b*h + h + 2) * h per gate and direction.
-        total = (num_input+h+2)*h*m*b + (self._num_layers-1)*m*h*(h+b*h+2)*b
-
-        arr = ndarray.zeros((total,), ctx=w0.context, dtype=w0.dtype)
-        # Assigning into each slice fills the corresponding region of arr.
-        for name, nd in self._slice_weights(arr, num_input, h).items():
-            nd[:] = args.pop(name)
-        args[self._parameter.name] = arr
-        return args
-
-    def __call__(self, inputs, states):
-        """Single stepping is not supported; always raises NotImplementedError."""
-        raise NotImplementedError("FusedRNNCell cannot be stepped. Please use unroll")
-
-    def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None):
-        """Unroll the fused RNN over `length` steps with a single RNN operator.
-
-        Returns (outputs, states); states is empty unless get_next_state was set.
-        """
-        self.reset()
-
-        # Merge the inputs into a single symbol (merge=True).
-        inputs, axis = _normalize_sequence(length, inputs, layout, True)
-        if axis == 1:
-            # The RNN operator is fed time-major data, so batch-major input is swapped.
-            warnings.warn("NTC layout detected. Consider using "
-                          "TNC for FusedRNNCell for faster speed")
-            inputs = symbol.swapaxes(inputs, dim1=0, dim2=1)
-        else:
-            assert axis == 0, "Unsupported layout %s"%layout
-        if begin_state is None:
-            begin_state = self.begin_state()
-
-        # The RNN operator takes its initial states as keyword arguments.
-        states = begin_state
-        if self._mode == 'lstm':
-            states = {'state': states[0], 'state_cell': states[1]} # pylint: disable=redefined-variable-type
-        else:
-            states = {'state': states[0]}
-
-        rnn = symbol.RNN(data=inputs, parameters=self._parameter,
-                         state_size=self._num_hidden, num_layers=self._num_layers,
-                         bidirectional=self._bidirectional, p=self._dropout,
-                         state_outputs=self._get_next_state,
-                         mode=self._mode, name=self._prefix+'rnn',
-                         **states)
-
-        # With state outputs enabled, rnn[0] is the sequence output and the
-        # remaining outputs are the final states.
-        if not self._get_next_state:
-            outputs, states = rnn, []
-        elif self._mode == 'lstm':
-            outputs, states = rnn[0], [rnn[1], rnn[2]]
-        else:
-            outputs, states = rnn[0], [rnn[1]]
-
-        if axis == 1:
-            # Swap back to the caller's batch-major layout.
-            outputs = symbol.swapaxes(outputs, dim1=0, dim2=1)
-
-        outputs, _ = _normalize_sequence(length, outputs, layout, merge_outputs)
-
-        return outputs, states
-
-    def unfuse(self):
-        """Unfuse the fused RNN in to a stack of rnn cells.
-
-        Returns
-        -------
-        cell : SequentialRNNCell
-            unfused cell that can be used for stepping, and can run on CPU.
-        """
-        stack = SequentialRNNCell()
-        # Factory for one per-layer cell of the configured mode.
-        # NOTE(review): forget_bias is not stored on this cell, so the LSTMCell
-        # created here uses its own default.
-        get_cell = {'rnn_relu': lambda cell_prefix: RNNCell(self._num_hidden,
-                                                            activation='relu',
-                                                            prefix=cell_prefix),
-                    'rnn_tanh': lambda cell_prefix: RNNCell(self._num_hidden,
-                                                            activation='tanh',
-                                                            prefix=cell_prefix),
-                    'lstm': lambda cell_prefix: LSTMCell(self._num_hidden,
-                                                         prefix=cell_prefix),
-                    'gru': lambda cell_prefix: GRUCell(self._num_hidden,
-                                                       prefix=cell_prefix)}[self._mode]
-        for i in range(self._num_layers):
-            # Cell prefixes mirror the names produced by _slice_weights.
-            if self._bidirectional:
-                stack.add(BidirectionalCell(
-                    get_cell('%sl%d_'%(self._prefix, i)),
-                    get_cell('%sr%d_'%(self._prefix, i)),
-                    output_prefix='%sbi_l%d_'%(self._prefix, i)))
-            else:
-                stack.add(get_cell('%sl%d_'%(self._prefix, i)))
-
-            # Dropout goes between layers only, never after the last one.
-            if self._dropout > 0 and i != self._num_layers - 1:
-                stack.add(DropoutCell(self._dropout, prefix='%s_dropout%d_'%(self._prefix, i)))
-
-        return stack
-
-
-class SequentialRNNCell(BaseRNNCell):
-    """Sequentially stacks multiple RNN cells; each cell feeds the next.
-
-    Parameters
-    ----------
-    params : RNNParams or None
-        container for weight sharing between cells.
-        created if None.
-    """
-    def __init__(self, params=None):
-        super(SequentialRNNCell, self).__init__(prefix='', params=params)
-        # When a container was supplied, it is pushed down into every added cell.
-        self._override_cell_params = params is not None
-        self._cells = []
-
-    def add(self, cell):
-        """Append a cell into the stack.
-
-        Parameters
-        ----------
-        cell : rnn cell
-        """
-        self._cells.append(cell)
-        if self._override_cell_params:
-            assert cell._own_params, \
-                "Either specify params for SequentialRNNCell " \
-                "or child cells, not both."
-            # Share this stack's parameters with the new cell.
-            cell.params._params.update(self.params._params)
-        # Expose the cell's parameters through this stack's container.
-        self.params._params.update(cell.params._params)
-
-    @property
-    def state_shape(self):
-        """State shapes of the stacked cells, as computed by _cells_state_shape."""
-        return _cells_state_shape(self._cells)
-
-    def begin_state(self, **kwargs):
-        """Initial states of the stacked cells; refuses to run on a modified cell."""
-        assert not self._modified, \
-            "After applying modifier cells (e.g. ZoneoutCell) the base " \
-            "cell cannot be called directly. Call the modifier cell instead."
-        return _cells_begin_state(self._cells, **kwargs)
-
-    def unpack_weights(self, args):
-        """Delegate weight unpacking to the stacked cells."""
-        return _cells_unpack_weights(self._cells, args)
-
-    def pack_weights(self, args):
-        """Delegate weight packing to the stacked cells."""
-        return _cells_pack_weights(self._cells, args)
-
-    def __call__(self, inputs, states):
-        """Step every cell once, threading the output of one into the next.
-
-        `states` is the flat concatenation of each cell's states; the returned
-        state list has the same layout.
-        """
-        self._counter += 1
-        collected = []
-        offset = 0
-        for child in self._cells:
-            assert not isinstance(child, BidirectionalCell)
-            width = len(child.state_shape)
-            # Hand each cell its own slice of the flat state list.
-            inputs, child_states = child(inputs, states[offset:offset+width])
-            offset += width
-            collected.append(child_states)
-        return inputs, sum(collected, [])
-
-    def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None):
-        """Unroll every cell over the whole sequence, one cell after another."""
-        self.reset()
-
-        last_index = len(self._cells) - 1
-        if begin_state is None:
-            begin_state = self.begin_state()
-
-        offset = 0
-        final_states = []
-        for index, child in enumerate(self._cells):
-            width = len(child.state_shape)
-            child_begin = begin_state[offset:offset+width]
-            offset += width
-            # Only the last cell honours the caller's merge_outputs; the others
-            # are left to choose their own output form.
-            child_merge = None if index < last_index else merge_outputs
-            inputs, child_states = child.unroll(length, inputs=inputs,
-                                                begin_state=child_begin, layout=layout,
-                                                merge_outputs=child_merge)
-            final_states.extend(child_states)
-
-        return inputs, final_states
-
-
-class DropoutCell(BaseRNNCell):
-    """Stateless cell that applies dropout to its input.
-
-    Parameters
-    ----------
-    dropout : float
-        percentage of elements to drop out, which
-        is 1 - percentage to retain.
-    """
-    def __init__(self, dropout, prefix='dropout_', params=None):
-        super(DropoutCell, self).__init__(prefix, params)
-        assert isinstance(dropout, numeric_types), "dropout probability must be a number"
-        self.dropout = dropout
-
-    @property
-    def state_shape(self):
-        """This cell carries no recurrent state."""
-        return []
-
-    def __call__(self, inputs, states):
-        """Apply dropout to `inputs` when the rate is positive; states pass through."""
-        rate = self.dropout
-        if rate > 0:
-            inputs = symbol.Dropout(data=inputs, p=rate)
-        return inputs, states
-
-    def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None):
-        """Apply dropout to a whole sequence.
-
-        A merged (single Symbol) sequence is handled in one call; a list of
-        per-step symbols falls back to the base class unroll.
-        """
-        self.reset()
-        inputs, _ = _normalize_sequence(length, inputs, layout, merge_outputs)
-        if not isinstance(inputs, symbol.Symbol):
-            return super(DropoutCell, self).unroll(
-                length, inputs, begin_state=begin_state, layout=layout,
-                merge_outputs=merge_outputs)
-        return self(inputs, [])
-
-
-class ModifierCell(BaseRNNCell):
-    """Base class for modifier cells. A modifier
-    cell takes a base cell, apply modifications
-    on it (e.g. Zoneout), and returns a new cell.
-
-    After applying modifiers the base cell should
-    no longer be called directly. The modifier cell
-    should be used instead.
-
-    Parameters
-    ----------
-    base_cell : BaseRNNCell
-        the cell being wrapped; it is flagged as modified.
-    """
-    def __init__(self, base_cell):
-        super(ModifierCell, self).__init__()
-        base_cell._modified = True
-        self.base_cell = base_cell
-
-    @property
-    def params(self):
-        """Parameter container of the wrapped cell (this cell owns none)."""
-        self._own_params = False
-        return self.base_cell.params
-
-    @property
-    def state_shape(self):
-        """State shapes of the wrapped cell."""
-        return self.base_cell.state_shape
-
-    def begin_state(self, init_sym=symbol.zeros, **kwargs):
-        """Initial states of the wrapped cell.
-
-        The wrapped cell's modified flag is lifted only for the duration of the
-        delegated call and is restored even when that call raises.
-        """
-        assert not self._modified, \
-            "After applying modifier cells (e.g. DropoutCell) the base " \
-            "cell cannot be called directly. Call the modifier cell instead."
-        self.base_cell._modified = False
-        try:
-            begin = self.base_cell.begin_state(init_sym, **kwargs)
-        finally:
-            # Always restore the flag so a failed call cannot leave the base
-            # cell looking unmodified.
-            self.base_cell._modified = True
-        return begin
-
-    def unpack_weights(self, args):
-        """Delegate weight unpacking to the wrapped cell."""
-        return self.base_cell.unpack_weights(args)
-
-    def pack_weights(self, args):
-        """Delegate weight packing to the wrapped cell."""
-        return self.base_cell.pack_weights(args)
-
-    def __call__(self, inputs, states):
-        """Subclasses implement the actual modification."""
-        raise NotImplementedError
-
-
-class ZoneoutCell(ModifierCell):
-    """Apply Zoneout on base cell.
-
-    Parameters
-    ----------
-    base_cell : BaseRNNCell
-        cell to wrap; fused and bidirectional cells are rejected.
-    zoneout_outputs : float, default 0.
-        rate used for the output mask; 0 disables output zoneout.
-    zoneout_states : float, default 0.
-        rate used for the state masks; 0 disables state zoneout.
-    """
-    def __init__(self, base_cell, zoneout_outputs=0., zoneout_states=0.):
-        assert not isinstance(base_cell, FusedRNNCell), \
-            "FusedRNNCell doesn't support zoneout. " \
-            "Please unfuse first."
-        assert not isinstance(base_cell, BidirectionalCell), \
-            "BidirectionalCell doesn't support zoneout since it doesn't support step. " \
-            "Please add ZoneoutCell to the cells underneath instead."
-        # SequentialRNNCell does not necessarily define `_bidirectional`; a plain
-        # attribute access would raise AttributeError instead of validating.
-        assert not isinstance(base_cell, SequentialRNNCell) or \
-            not getattr(base_cell, '_bidirectional', False), \
-            "Bidirectional SequentialRNNCell doesn't support zoneout. " \
-            "Please add ZoneoutCell to the cells underneath instead."
-        super(ZoneoutCell, self).__init__(base_cell)
-        self.zoneout_outputs = zoneout_outputs
-        self.zoneout_states = zoneout_states
-        self.prev_output = None
-
-    def reset(self):
-        """Reset the cell and forget the output of the previous step."""
-        super(ZoneoutCell, self).reset()
-        self.prev_output = None
-
-    def __call__(self, inputs, states):
-        """Step the base cell, then mix old and new values through dropout masks.
-
-        Returns (output, states) where masked-out positions keep the previous
-        output / previous state instead of the freshly computed one.
-        """
-        cell, p_outputs, p_states = self.base_cell, self.zoneout_outputs, self.zoneout_states
-        next_output, next_states = cell(inputs, states)
-        # Dropout applied to a ones array shaped like `like` yields the mask.
-        mask = (lambda p, like:
-                symbol.Dropout(_symbol_internal._identity_with_attr_like_rhs(symbol.ones((0, 0)),
-                                                                             like),
-                               p=p))
-
-        # Compare against None explicitly: a Symbol should not be used in a
-        # truth test to find out whether a previous output exists.
-        prev_output = self.prev_output if self.prev_output is not None else symbol.zeros((0, 0))
-
-        output = (symbol.where(mask(p_outputs, next_output), next_output, prev_output)
-                  if p_outputs != 0. else next_output)
-        states = ([symbol.where(mask(p_states, new_s), new_s, old_s) for new_s, old_s in
-                   zip(next_states, states)] if p_states != 0. else next_states)
-
-        self.prev_output = output
-
-        return output, states
-
-
-
-class BidirectionalCell(BaseRNNCell):
-    """Bidirectional RNN cell
-
-    Parameters
-    ----------
-    l_cell : BaseRNNCell
-        cell for forward unrolling
-    r_cell : BaseRNNCell
-        cell for backward unrolling
-    params : RNNParams or None
-        container passed on to the base class constructor.
-    output_prefix : str, default 'bi_'
-        prefix for name of output
-    """
-    def __init__(self, l_cell, r_cell, params=None, output_prefix='bi_'):
-        super(BidirectionalCell, self).__init__('', params=params)
-        # NOTE(review): this flag is stored but not read anywhere in this class.
-        self._override_cell_params = params is not None
-        self._cells = [l_cell, r_cell]
-        self._output_prefix = output_prefix
-
-    def unpack_weights(self, args):
-        """Delegate weight unpacking to the two wrapped cells."""
-        return _cells_unpack_weights(self._cells, args)
-
-    def pack_weights(self, args):
-        """Delegate weight packing to the two wrapped cells."""
-        return _cells_pack_weights(self._cells, args)
-
-    def __call__(self, inputs, states):
-        """Single stepping is not supported; always raises NotImplementedError."""
-        raise NotImplementedError("Bidirectional cannot be stepped. Please use unroll")
-
-    @property
-    def state_shape(self):
-        """State shapes of the two wrapped cells, as computed by _cells_state_shape."""
-        return _cells_state_shape(self._cells)
-
-    def begin_state(self, **kwargs):
-        """Initial states of the two wrapped cells; refuses to run on a modified cell."""
-        assert not self._modified, \
-            "After applying modifier cells (e.g. DropoutCell) the base " \
-            "cell cannot be called directly. Call the modifier cell instead."
-        return _cells_begin_state(self._cells, **kwargs)
-
-    def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None):
-        """Unroll l_cell on the sequence and r_cell on the reversed sequence,
-        then concatenate their outputs step by step.
-
-        Returns (outputs, [l_states, r_states]); note the returned states are
-        two nested lists rather than one flat list.
-        """
-        self.reset()
-
-        # Split into a per-step list (merge=False) so the sequence can be reversed.
-        inputs, axis = _normalize_sequence(length, inputs, layout, False)
-        if begin_state is None:
-            begin_state = self.begin_state()
-
-        # The flat state list holds l_cell's states first, then r_cell's.
-        states = begin_state
-        l_cell, r_cell = self._cells
-        l_outputs, l_states = l_cell.unroll(length, inputs=inputs,
-                                            begin_state=states[:len(l_cell.state_shape)],
-                                            layout=layout, merge_outputs=merge_outputs)
-        r_outputs, r_states = r_cell.unroll(length,
-                                            inputs=list(reversed(inputs)),
-                                            begin_state=states[len(l_cell.state_shape):],
-                                            layout=layout, merge_outputs=merge_outputs)
-
-        if merge_outputs is None:
-            # Merge only when both cells already returned merged symbols;
-            # otherwise bring both to per-step lists.
-            merge_outputs = (isinstance(l_outputs, symbol.Symbol)
-                             and isinstance(r_outputs, symbol.Symbol))
-            if not merge_outputs:
-                if isinstance(l_outputs, symbol.Symbol):
-                    l_outputs = list(symbol.SliceChannel(l_outputs, axis=axis,
-                                                         num_outputs=length, squeeze_axis=1))
-                if isinstance(r_outputs, symbol.Symbol):
-                    r_outputs = list(symbol.SliceChannel(r_outputs, axis=axis,
-                                                         num_outputs=length, squeeze_axis=1))
-
-        # Undo the reversal so r_cell's outputs line up with l_cell's steps.
-        if merge_outputs:
-            l_outputs = [l_outputs]
-            r_outputs = [symbol.reverse(r_outputs, axis=axis)]
-        else:
-            r_outputs = list(reversed(r_outputs))
-
-        # dim is 2 for merged outputs and 1 for per-step outputs
-        # (merge_outputs is used as 0/1 in the arithmetic).
-        outputs = [symbol.Concat(l_o, r_o, dim=1+merge_outputs,
-                                 name=('%sout'%(self._output_prefix) if merge_outputs
-                                       else '%st%d'%(self._output_prefix, i)))
-                   for i, l_o, r_o in
-                   zip(range(len(l_outputs)), l_outputs, r_outputs)]
-
-        if merge_outputs:
-            outputs = outputs[0]
-
-        states = [l_states, r_states]
-        return outputs, states
diff --git a/seq2seq/seq2seq_bucketing.py b/seq2seq/seq2seq_bucketing.py
deleted file mode 100755
index ef7b471..0000000
--- a/seq2seq/seq2seq_bucketing.py
+++ /dev/null
@@ -1,498 +0,0 @@
-import numpy as np
-import mxnet as mx
-import argparse
-import cPickle as pickle
-#import dill as pickle
-import math
-import nltk
-
-from mxnet.rnn import LSTMCell, SequentialRNNCell, FusedRNNCell
-#from rnn_cell import LSTMCell, SequentialRNNCell
-from itertools import takewhile, dropwhile
-
-from time import time
-import re
-from unidecode import unidecode
-
-from utils import array_to_text, tokenize_text, invert_dict, get_s2s_data, Dataset
-
-from seq2seq_iterator import *
-
-from attention_cell import AttentionEncoderCell, DotAttentionCell
-
-def _str2bool(value):
-    """Parse a command-line boolean such as 'true'/'False'/'1'/'0'.
-
-    Using type=bool with argparse turns every non-empty string, including
-    'False', into True; this converter reads the text instead.
-    """
-    if isinstance(value, bool):
-        return value
-    text = value.strip().lower()
-    if text in ('1', 'true', 't', 'yes', 'y', 'on'):
-        return True
-    if text in ('', '0', 'false', 'f', 'no', 'n', 'off'):
-        return False
-    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)
-
-# Command-line interface of the seq2seq training / inference script.
-parser = argparse.ArgumentParser(description="Train a sequence-to-sequence RNN model",
-                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--infer', default=False, action='store_true',
-                    help='whether to do inference instead of training')
-parser.add_argument('--model-prefix', type=str, default=None,
-                    help='path to save/load model')
-parser.add_argument('--load-epoch', type=int, default=0,
-                    help='load from epoch')
-parser.add_argument('--num-layers', type=int, default=2,
-                    help='number of stacked RNN layers')
-parser.add_argument('--num-hidden', type=int, default=200,
-                    help='hidden layer size')
-parser.add_argument('--num-embed', type=int, default=200,
-                    help='embedding layer size')
-# Accepts an explicit value ("--bidirectional False") or a bare flag ("--bidirectional").
-parser.add_argument('--bidirectional', type=_str2bool, nargs='?', const=True, default=False,
-                    help='whether to use bidirectional layers')
-parser.add_argument('--gpus', type=str,
-                    help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. ' \
-                         'Increase batch size when using multiple gpus for best performance.')
-parser.add_argument('--kv-store', type=str, default='device',
-                    help='key-value store type')
-parser.add_argument('--num-epochs', type=int, default=25,
-                    help='max num of epochs')
-parser.add_argument('--lr', type=float, default=0.01,
-                    help='initial learning rate')
-parser.add_argument('--optimizer', type=str, default='sgd',
-                    help='the optimizer type')
-parser.add_argument('--mom', type=float, default=0.0,
-                    help='momentum for sgd')
-parser.add_argument('--wd', type=float, default=0.00001,
-                    help='weight decay for sgd')
-parser.add_argument('--batch-size', type=int, default=32,
-                    help='the batch size.')
-parser.add_argument('--disp-batches', type=int, default=50,
-                    help='show progress for every n batches')
-parser.add_argument('--max-grad-norm', type=float, default=5.0,
-                    help='maximum gradient norm (larger values will be clipped)')
-# When training a deep, complex model, it's recommended to stack fused RNN cells (one
-# layer per cell) together instead of one with all layers. The reason is that fused RNN
-# cells don't set gradients to be ready until the computation for the entire layer is
-# completed. Breaking a multi-layer fused RNN cell into several one-layer ones allows
-# gradients to be processed earlier. This reduces communication overhead, especially with
-# multiple GPUs.
-parser.add_argument('--stack-rnn', type=_str2bool, nargs='?', const=True, default=False,
-                    help='stack fused RNN cells to reduce communication overhead')
-parser.add_argument('--dropout', type=float, default=0.0,
-                    help='dropout probability (1.0 - keep probability)')
-parser.add_argument('--use-cudnn-cells', action='store_true',
-                    help='Use CUDNN LSTM (mx.rnn.FusedRNNCell) for training instead of in-graph LSTM cells (mx.rnn.LSTMCell)')
-
-#buckets = [32]
-# buckets = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
-
-start_label = 1
-invalid_label = 0
-
-# Ids reserved for the special tokens of the vocabularies.
-reserved_tokens={'<PAD>':0, '<UNK>':1, '<EOS>':2, '<GO>':3}
-
-def _print_shape_section(top_rule, title, bottom_rule, names, shapes):
-    """Print one titled section of 'name: shape' lines, or N/A when there are no shapes."""
-    print(top_rule)
-    print(title)
-    print(bottom_rule)
-    # Shape inference can yield None when shapes are unknown; treat it as empty.
-    shapes = shapes or []
-    if len(shapes) == 0:
-        print("N/A")
-    # zip stops at the shorter sequence, so a name/shape length mismatch
-    # cannot raise IndexError.
-    for name, shape in zip(names, shapes):
-        print("%s: %s" % (name, shape))
-
-def print_inferred_shapes(node, arg_shapes, aux_shapes, out_shapes):
-    """Print the argument, auxiliary-state and output shapes of a symbol.
-
-    Parameters
-    ----------
-    node : object providing name, list_arguments(), list_auxiliary_states()
-        and list_outputs()
-    arg_shapes, aux_shapes, out_shapes : sequence of shapes or None
-        shapes paired by position with the corresponding names; None or an
-        empty sequence is reported as N/A.
-    """
-    args = node.list_arguments()
-    aux_states = node.list_auxiliary_states()
-    outputs = node.list_outputs()
-    print("\n================================================")
-    print("\nNODE: %s" % node.name)
-    _print_shape_section("\n============", "args:", "============", args, arg_shapes)
-    _print_shape_section("\n=============", "aux_states:", "=============", aux_states, aux_shapes)
-    _print_shape_section("\n=============", "outputs:", "==============", outputs, out_shapes)
-    print("\n================================================")
-    print("\n")
-
-def _normalize_sequence(length, inputs, layout, merge, in_layout=None):
-    """Convert a sequence between a single merged symbol and a per-step list.
-
-    Parameters
-    ----------
-    length : int or None
-        number of time steps; used to split a merged symbol and to validate
-        the length of a list input.
-    inputs : Symbol or list of Symbol
-        the sequence, either merged or one symbol per step.
-    layout : str
-        desired layout; the position of 'T' gives the time axis.
-    merge : bool or None
-        False splits a merged symbol into a list, True merges a list into
-        one symbol, None leaves the form unchanged.
-    in_layout : str or None
-        layout of the incoming data; defaults to `layout`.
-
-    Returns
-    -------
-    inputs : Symbol or list of Symbol
-    axis : int
-        index of the time axis in `layout`.
-    """
-    # Only `symbol` is needed here; importing unused mxnet internals would
-    # make the function depend on names that may not exist.
-    from mxnet import symbol
-
-    assert inputs is not None, \
-        "unroll(inputs=None) has been deprecated. " \
-        "Please create input variables outside unroll."
-
-    axis = layout.find('T')
-    in_axis = in_layout.find('T') if in_layout is not None else axis
-    if isinstance(inputs, symbol.Symbol):
-        if merge is False:
-            assert len(inputs.list_outputs()) == 1, \
-                "unroll doesn't allow grouped symbol as input. Please convert " \
-                "to list with list(inputs) first or let unroll handle splitting."
-            inputs = list(symbol.split(inputs, axis=in_axis, num_outputs=length,
-                                       squeeze_axis=1))
-    else:
-        assert length is None or len(inputs) == length
-        if merge is True:
-            inputs = [symbol.expand_dims(i, axis=axis) for i in inputs]
-            inputs = symbol.Concat(*inputs, dim=axis)
-            # The merged symbol is built directly in the target layout.
-            in_axis = axis
-
-    if isinstance(inputs, symbol.Symbol) and axis != in_axis:
-        # swapaxes takes `dim1` and `dim2` (as used by FusedRNNCell.unroll);
-        # the previous dim0/dim1 keywords do not name its two axes.
-        inputs = symbol.swapaxes(inputs, dim1=axis, dim2=in_axis)
-
-    return inputs, axis
-
-def get_data(layout):
-    """Load the pickled training and validation iterators from ./data.
-
-    Each iterator is initialized and given the batch size from the global
-    `args`. A few sample sentences and the load time are printed.
-    `layout` is accepted but not used.
-
-    Returns
-    -------
-    (train_iter, valid_iter, source vocabulary, target vocabulary), with both
-    vocabularies taken from the training iterator.
-    """
-    def _restore(path):
-        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
-        with open(path, 'rb') as handle: # _en_de.pkl
-            iterator = pickle.load(handle)
-        iterator.initialize()
-        iterator.batch_size = args.batch_size
-        return iterator
-
-    began = time()
-
-    print("\nUnpickling training iterator")
-    train_iter = _restore('./data/train_iterator.pkl')
-
-    print("\nUnpickling validation iterator")
-    valid_iter = _restore('./data/valid_iterator.pkl')
-
-    print("\nEncoded source language sentences:\n")
-    for idx in range(5):
-        print(array_to_text(train_iter.src_sent[idx], train_iter.inv_src_vocab))
-
-    # Target samples come from the validation set, decoded with the training vocabulary.
-    print("\nEncoded target language sentences:\n")
-    for idx in range(5):
-        print(array_to_text(valid_iter.targ_sent[idx], train_iter.inv_targ_vocab))
-
-    elapsed = time() - began
-
-    print("\nDataset deserialization time: %.2f seconds\n" % elapsed)
-
-    return train_iter, valid_iter, train_iter.src_vocab, train_iter.targ_vocab
-
-# WORK IN PROGRESS !!!
-def decoder_unroll(decoder, target_embed, targ_vocab, unroll_length, go_symbol, begin_state=None, layout='TNC', merge_outputs=None):
-        """Unroll `decoder` step by step, feeding each output back as the next input.
-
-        Work in progress. Only the first step of `target_embed` is used as
-        input; every later step consumes the previous step's output.
-        `targ_vocab` is referenced only by the commented-out variant below and
-        `go_symbol` is not used at all.
-
-        Returns
-        -------
-        (outputs, states): the per-step outputs normalized according to
-        `layout` / `merge_outputs`, and the decoder states after the last step.
-        """
-
-        decoder.reset()
-
-        if begin_state is None:
-            begin_state = decoder.begin_state()
-
-        # Split the target embedding into one symbol per time step.
-        inputs, _ = _normalize_sequence(unroll_length, target_embed, layout, False)
-
-        # Need to use hidden state from attention model, but <GO> as input
-        states = begin_state
-        outputs = []
-
-        # First-step input; later steps reuse the decoder's own output.
-        embed = inputs[0]
-
-        # NEW 1
-        # (disabled variant: project each output to the vocabulary, take the
-        # argmax and embed it as the next input)
-#        fc_weight = mx.sym.Variable('fc_weight')
-#        fc_bias = mx.sym.Variable('fc_bias')
-#        em_weight = mx.sym.Variable('em_weight')
-#        for i in range(0, unroll_length):
-#            output, states = decoder(embed, states)
-#            outputs.append(embed)
-#            fc = mx.sym.FullyConnected(data=output, weight=fc_weight, bias=fc_bias, num_hidden=len(targ_vocab), name='decoder_fc%d_'%i)
-#            am = mx.sym.argmax(data=fc, axis=1)
-#            embed = mx.sym.Embedding(data=am, weight=em_weight, input_dim=len(targ_vocab),
-#                output_dim=args.num_embed, name='decoder_embed%d_'%i)
-
-        # NEW 2
-        # (active variant: the raw decoder output is fed straight back in)
-        for i in range(0, unroll_length):
-            embed, states = decoder(embed, states)
-            outputs.append(embed)
-
-        outputs, _ = _normalize_sequence(unroll_length, outputs, layout, merge_outputs)
-
-        return outputs, states
-
-def train(args):
-
-    from time import time
-
-    data_train, data_val, src_vocab, targ_vocab = get_data('TN')
-    print "len(src_vocab) len(targ_vocab)", len(src_vocab), len(targ_vocab)
-
-    encoder = SequentialRNNCell()
-
-    if args.use_cudnn_cells:
-        encoder.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout,
-            mode='lstm', prefix='lstm_encoder', bidirectional=args.bidirectional, get_next_state=True))
-    else:
-        for i in range(args.num_layers):
-            encoder.add(LSTMCell(args.num_hidden, prefix='rnn_encoder%d_' % i))
-            if i < args.num_layers - 1 and args.dropout > 0.0:
-                encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i))
-    encoder.add(AttentionEncoderCell())
-
-    decoder = mx.rnn.SequentialRNNCell()
-
-    if args.use_cudnn_cells:
-        decoder.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, 
-            mode='lstm', prefix='lstm_decoder', bidirectional=args.bidirectional, get_next_state=True))
-    else:
-        for i in range(args.num_layers):
-            decoder.add(LSTMCell(args.num_hidden, prefix=('rnn_decoder%d_' % i)))
-            if i < args.num_layers - 1 and args.dropout > 0.0:
-                decoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_decoder%d_' % i))
-    decoder.add(DotAttentionCell())
-
-    def sym_gen(seq_len):
-        src_data = mx.sym.Variable('src_data')
-        targ_data = mx.sym.Variable('targ_data')
-        label = mx.sym.Variable('softmax_label')
- 
-        src_embed = mx.sym.Embedding(data=src_data, input_dim=len(src_vocab), 
-                                 output_dim=args.num_embed, name='src_embed') 
-        targ_embed = mx.sym.Embedding(data=targ_data, input_dim=len(targ_vocab),    # data=data
-                                 output_dim=args.num_embed, name='targ_embed')
-
-        encoder.reset()
-        decoder.reset()
-
-        enc_seq_len, dec_seq_len = seq_len
-
-        layout = 'TNC'
-        _, states = encoder.unroll(enc_seq_len, inputs=src_embed, layout=layout)
-
-        # This should be based on EOS or max seq len for inference, but here we unroll to the target length
-        # TODO: fix <GO> symbol
-        outputs, _ = decoder.unroll(dec_seq_len, targ_embed, begin_state=states, layout=layout, merge_outputs=True)
-#        outputs, _ = decoder_unroll(decoder, targ_embed, targ_vocab, dec_seq_len, 0, begin_state=states, layout='TNC', merge_outputs=True)
-
-        # NEW
-        rs = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden), name='sym_gen_reshape1')
-        fc = mx.sym.FullyConnected(data=rs, num_hidden=len(targ_vocab), name='sym_gen_fc')
-        label_rs = mx.sym.Reshape(data=label, shape=(-1,), name='sym_gen_reshape2')
-        pred = mx.sym.SoftmaxOutput(data=fc, label=label_rs, name='sym_gen_softmax')
-
-        return pred, ('src_data', 'targ_data',), ('softmax_label',)
-
-
-#    foo, _, _ = sym_gen((1, 1))
-#    print(type(foo))
-#    mx.viz.plot_network(symbol=foo).save('./seq2seq.dot')
-
-
-    if args.gpus:
-        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
-    else:
-        contexts = mx.cpu(0)
-
-    model = mx.mod.BucketingModule( 
-        sym_gen             = sym_gen,
-        default_bucket_key  = data_train.default_bucket_key,
-        context             = contexts)
-
-    if args.load_epoch:
-        _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(
-            cell, args.model_prefix, args.load_epoch)
-    else:
-        arg_params = None
-        aux_params = None
-
-    opt_params = {
-      'learning_rate': args.lr,
-      'wd': args.wd
-    }
-
-    if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']:
-        opt_params['momentum'] = args.mom
-
-    opt_params['clip_gradient'] = args.max_grad_norm
-
-    start = time()
-
-    model.fit(
-        train_data          = data_train,
-        eval_data           = data_val,
-        eval_metric         = mx.metric.Perplexity(invalid_label),
-        kvstore             = args.kv_store,
-        optimizer           = args.optimizer,
-        optimizer_params    = opt_params, 
-        initializer         = mx.init.Xavier(factor_type="in", magnitude=2.34),
-        arg_params          = arg_params,
-        aux_params          = aux_params,
-        begin_epoch         = args.load_epoch,
-        num_epoch           = args.num_epochs,
-        batch_end_callback  = mx.callback.Speedometer(batch_size=args.batch_size, frequent=args.disp_batches, auto_reset=True),
-        epoch_end_callback  = mx.rnn.do_rnn_checkpoint(decoder, args.model_prefix, 1)
-                              if args.model_prefix else None)
-
-    train_duration = time() - start
-    time_per_epoch = train_duration / args.num_epochs
-    print("\n\nTime per epoch: %.2f seconds\n\n" % time_per_epoch)
-
-class BleuScore(mx.metric.EvalMetric):
-    def __init__(self, ignore_label, axis=-1):
-        super(BleuScore, self).__init__('BleuScore')
-        self.ignore_label = ignore_label
-        self.axis = axis
-
-    def update(self, labels, preds):
-        assert len(labels) == len(preds)
-
-        def drop_sentinels(text_lst):
-            sentinels = lambda x: x == reserved_tokens['<PAD>'] or x == reserved_tokens['<GO>']
-            text_lst = dropwhile(lambda x: sentinels(x), text_lst)
-            text_lst = takewhile(lambda x: not sentinels(x) and x != reserved_tokens['<EOS>'], text_lst)
-            return list(text_lst)
-
-        smoothing_fn = nltk.translate.bleu_score.SmoothingFunction().method3
-
-        for label, pred in zip(labels, preds):
-            maxed = mx.ndarray.argmax(data=pred, axis=1)
-            pred_nparr = maxed.asnumpy()
-            label_nparr = label.asnumpy().astype(np.int32) 
-            sent_len, batch_size = np.shape(label_nparr)
-            pred_nparr = pred_nparr.reshape(sent_len, batch_size).astype(np.int32)
-
-            for i in range(batch_size):
-                exp_lst = drop_sentinels(label_nparr[:, i].tolist())
-                act_lst = drop_sentinels(pred_nparr[:, i].tolist())
-                expected = exp_lst
-                actual = act_lst
-                bleu = nltk.translate.bleu_score.sentence_bleu(
-                    references=[expected], hypothesis=actual, weights=(0.25, 0.25, 0.25, 0.25),
-                    smoothing_function = smoothing_fn 
-                )
-#                print("bleu: %f" % bleu)
-                self.sum_metric += bleu
-                self.num_inst += 1
-            assert label.size == pred.size/pred.shape[-1], \
-                "shape mismatch: %s vs. %s"%(label.shape, pred.shape)
-
-    def get(self):
-        num = self.num_inst if self.num_inst > 0 else float('nan')
-        return (self.name, self.sum_metric/num)
-
-
-def infer(args):
-    assert args.model_prefix, "Must specifiy path to load from"
-
-    data_train, data_val, src_vocab, targ_vocab = get_data('TN')
-
-    print "len(src_vocab) len(targ_vocab)", len(src_vocab), len(targ_vocab)
-
-    if args.use_cudnn_cells:
-        encoder = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout,
-            mode='lstm', prefix='lstm_encoder', bidirectional=args.bidirectional, get_next_state=True).unfuse()
-
-    else:
-        encoder = SequentialRNNCell()
-
-        for i in range(args.num_layers):
-            encoder.add(LSTMCell(args.num_hidden, prefix='rnn_encoder%d_' % i))
-            if i < args.num_layers - 1 and args.dropout > 0.0:
-                encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i))
-
-    encoder.add(AttentionEncoderCell())
-
-    if args.use_cudnn_cells:
-        decoder = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, 
-            mode='lstm', prefix='lstm_decoder', bidirectional=args.bidirectional, get_next_state=True).unfuse()
- 
-    else:
-        decoder = mx.rnn.SequentialRNNCell()
-
-        for i in range(args.num_layers):
-            decoder.add(LSTMCell(args.num_hidden, prefix=('rnn_decoder%d_' % i)))
-            if i < args.num_layers - 1 and args.dropout > 0.0:
-                decoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_decoder%d_' % i))
-
-    decoder.add(DotAttentionCell())
-
-    def sym_gen(seq_len):
-        src_data = mx.sym.Variable('src_data')
-        targ_data = mx.sym.Variable('targ_data')
-        label = mx.sym.Variable('softmax_label')
- 
-        src_embed = mx.sym.Embedding(data=src_data, input_dim=len(src_vocab), 
-                                 output_dim=args.num_embed, name='src_embed') 
-        targ_embed = mx.sym.Embedding(data=targ_data, input_dim=len(targ_vocab),    # data=data
-                                 output_dim=args.num_embed, name='targ_embed')
-
-        encoder.reset()
-        decoder.reset()
-
-        enc_seq_len, dec_seq_len = seq_len
-
-        layout = 'TNC'
-        _, states = encoder.unroll(enc_seq_len, inputs=src_embed, layout=layout)
-
-        # This should be based on EOS or max seq len for inference, but here we unroll to the target length
-        # TODO: fix <GO> symbol
-#        outputs, _ = decoder.unroll(dec_seq_len, targ_embed, begin_state=states, layout=layout, merge_outputs=True)
-        outputs, _ = decoder_unroll(decoder, targ_embed, targ_vocab, dec_seq_len, 0, begin_state=states, layout='TNC', merge_outputs=True)
-
-        # NEW
-        rs = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden), name='sym_gen_reshape1')
-        fc = mx.sym.FullyConnected(data=rs, num_hidden=len(targ_vocab), name='sym_gen_fc')
-        label_rs = mx.sym.Reshape(data=label, shape=(-1,), name='sym_gen_reshape2')
-        pred = mx.sym.SoftmaxOutput(data=fc, label=label_rs, name='sym_gen_softmax')
-
-        return pred, ('src_data', 'targ_data',), ('softmax_label',)
-
-    if args.gpus:
-        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
-    else:
-        contexts = mx.cpu(0)
-
-    model = mx.mod.BucketingModule( 
-        sym_gen             = sym_gen,
-        default_bucket_key  = data_train.default_bucket_key,
-        context             = contexts)
-
-    model.bind(data_val.provide_data, data_val.provide_label, for_training=False)
-
-    if args.load_epoch:
-        _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(
-            decoder, args.model_prefix, args.load_epoch)
-        model.set_params(arg_params, aux_params)
-
-    else:
-        arg_params = None
-        aux_params = None
-
-    opt_params = {
-      'learning_rate': args.lr,
-      'wd': args.wd
-    }
-
-    if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']:
-        opt_params['momentum'] = args.mom
-
-    opt_params['clip_gradient'] = args.max_grad_norm
-
-    start = time()
-
-    # mx.metric.Perplexity
-    model.score(data_val, BleuScore(invalid_label), #PPL(invalid_label),
-                batch_end_callback=mx.callback.Speedometer(batch_size=args.batch_size, frequent=5, auto_reset=True))
-
-    infer_duration = time() - start
-    time_per_epoch = infer_duration / args.num_epochs
-    print("\n\nTime per epoch: %.2f seconds\n\n" % time_per_epoch)
-
-if __name__ == '__main__':
-    import logging
-    head = '%(asctime)-15s %(message)s'
-    logging.basicConfig(level=logging.DEBUG, format=head)
-
-    args = parser.parse_args()
-    if args.gpus:
-        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
-    else:
-        contexts = mx.cpu(0)
-    
-
-    if args.num_layers >= 4 and len(args.gpus.split(',')) >= 4 and not args.stack_rnn:
-        print('WARNING: stack-rnn is recommended to train complex model on multiple GPUs')
-
-    if args.infer:
-        # Demonstrates how to load a model trained with CuDNN RNN and predict
-        # with non-fused MXNet symbol
-        infer(args)
-    else:
-        train(args)
diff --git a/seq2seq_bucketing.py b/seq2seq_bucketing.py
new file mode 100755
index 0000000..e7bbd1a
--- /dev/null
+++ b/seq2seq_bucketing.py
@@ -0,0 +1,737 @@
+import numpy as np
+import mxnet as mx
+import argparse
+import cPickle as pickle
+#import dill as pickle
+import math
+import nltk
+
+from mxnet.rnn import LSTMCell, SequentialRNNCell, FusedRNNCell, BidirectionalCell
+#from rnn_cell import LSTMCell, SequentialRNNCell
+from itertools import takewhile, dropwhile
+from operator import itemgetter
+
+from time import time
+import re
+from unidecode import unidecode
+
+from utils import array_to_text, tokenize_text, invert_dict, get_s2s_data, Dataset
+
+from seq2seq_iterator import *
+
+# from attention_cell import AttentionEncoderCell, DotAttentionCell
+
+parser = argparse.ArgumentParser(description="Train or evaluate a sequence-to-sequence (encoder-decoder) RNN with bucketing",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--infer', default=False, action='store_true',
+                    help='whether to do inference instead of training')
+parser.add_argument('--model-prefix', type=str, default=None,
+                    help='path to save/load model')
+parser.add_argument('--load-epoch', type=int, default=0,
+                    help='load from epoch')
+parser.add_argument('--num-layers', type=int, default=2,
+                    help='number of stacked RNN layers')
+parser.add_argument('--num-hidden', type=int, default=200,
+                    help='hidden layer size')
+parser.add_argument('--num-embed', type=int, default=200,
+                    help='embedding layer size')
+parser.add_argument('--bidirectional', action='store_true',
+                    help='whether to use bidirectional layers')
+parser.add_argument('--gpus', type=str,
+                    help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. ' \
+                         'Increase batch size when using multiple gpus for best performance.')
+parser.add_argument('--kv-store', type=str, default='device',
+                    help='key-value store type')
+parser.add_argument('--num-epochs', type=int, default=25,
+                    help='max num of epochs')
+parser.add_argument('--lr', type=float, default=0.01,
+                    help='initial learning rate')
+parser.add_argument('--optimizer', type=str, default='sgd',
+                    help='the optimizer type')
+parser.add_argument('--mom', type=float, default=0.0,
+                    help='momentum for sgd')
+parser.add_argument('--wd', type=float, default=0.00001,
+                    help='weight decay for sgd')
+parser.add_argument('--batch-size', type=int, default=32,
+                    help='the batch size.')
+parser.add_argument('--disp-batches', type=int, default=50,
+                    help='show progress for every n batches')
+parser.add_argument('--max-grad-norm', type=float, default=5.0,
+                    help='maximum gradient norm (larger values will be clipped)')
+
+# When training a deep, complex model, it's recommended to stack fused RNN cells (one
+# layer per cell) together instead of one with all layers. The reason is that fused RNN
+# cells don't set gradients to be ready until the computation for the entire layer is
+# completed. Breaking a multi-layer fused RNN cell into several one-layer ones allows
+# gradients to be processed earlier. This reduces communication overhead, especially with
+# multiple GPUs.
+parser.add_argument('--stack-rnn', nargs='?', const=True, default=False,
+                    type=lambda s: s.lower() in ('1', 'true', 'yes'),  # so "--stack-rnn False" really is False
+                    help='stack fused RNN cells to reduce communication overhead')
+parser.add_argument('--dropout', type=float, default=0.0,
+                    help='dropout probability (1.0 - keep probability)')
+parser.add_argument('--use-cudnn-cells', action='store_true',
+                    help='Use CUDNN LSTM (mx.rnn.FusedRNNCell) for training instead of in-graph LSTM cells (mx.rnn.LSTMCell)')
+
+parser.add_argument('--inference-unrolling-for-training', action='store_true',
+                    help='Feed previous prediction (instead of previous ground truth) into the decoder input during training')
+parser.add_argument('--seed', type=int, default=1234,
+                    help='Set random seed for Python, NumPy and MxNet RNGs')
+
+parser.add_argument('--remove-state-feed', action='store_true',
+                    help='Remove direct state feeding from encoder to decoder (use when using attention)')
+
+parser.add_argument('--input-feed', action='store_true',
+                    help='Enable input feed (attention is fed into the decoder as input, rather than concatenated with output)')
+
+parser.add_argument('--attention', action='store_true',
+                    help='Use attention (dot attention is the currently implemented form)')
+
+#buckets = [32]
+# buckets = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+
+start_label = 1  # not referenced in the code visible here -- TODO confirm it is still needed
+invalid_label = 0  # handed to mx.metric.Perplexity in train() as the label id to ignore
+# Ids of the special vocabulary tokens; drop_sentinels() uses <PAD>, <GO> and <EOS> to trim sequences.
+reserved_tokens={'<PAD>':0, '<UNK>':1, '<EOS>':2, '<GO>':3}
+
+def print_inferred_shapes(node, arg_shapes, aux_shapes, out_shapes):
+    """Print the names and inferred shapes of the arguments, auxiliary states and outputs of `node`."""
+    arg_names, aux_names, out_names = node.list_arguments(), node.list_auxiliary_states(), node.list_outputs()
+    print("\n================================================")
+    print("\nNODE: %s" % node.name)
+    print("\n============")
+    print("args:")
+    print("============")
+    if len(arg_shapes) == 0:
+        print("N/A")
+    # The argument section walks the shapes; the two sections below walk the names instead.
+    for idx, shape in enumerate(arg_shapes):
+        print("%s: %s" % (arg_names[idx], shape))
+    print("\n=============")
+    print("aux_states:")
+    print("=============")
+    if len(aux_shapes) == 0:
+        print("N/A")
+    for idx, name in enumerate(aux_names):
+        print("%s: %s" % (name, aux_shapes[idx]))
+    print("\n=============")
+    print("outputs:")
+    print("==============")
+    if len(out_shapes) == 0:
+        print("N/A")
+    for idx, name in enumerate(out_names):
+        print("%s: %s" % (name, out_shapes[idx]))
+    print("\n================================================")
+    print("\n")
+
+def _normalize_sequence(length, inputs, layout, merge, in_layout=None):
+    """Convert `inputs` between a single merged symbol and a list of per-step symbols.
+
+    'T' in `layout` (wanted) / `in_layout` (current, defaults to `layout`) marks the time axis. merge=False splits
+    a symbol into `length` steps, merge=True stacks a list along the time axis, anything else keeps the container.
+    Returns `(inputs, axis)`, where `axis` is the position of 'T' in `layout`."""
+    assert inputs is not None, \
+        "unroll(inputs=None) has been deprecated. " \
+        "Please create input variables outside unroll."
+    # Uses the module-level `mx`; the old per-call `from mxnet import ...` also pulled in unused private modules.
+    axis = layout.find('T')
+    in_axis = in_layout.find('T') if in_layout is not None else axis
+    if isinstance(inputs, mx.sym.Symbol):
+        if merge is False:
+            assert len(inputs.list_outputs()) == 1, \
+                "unroll doesn't allow grouped symbol as input. Please convert " \
+                "to list with list(inputs) first or let unroll handle splitting."
+            inputs = list(mx.sym.split(inputs, axis=in_axis, num_outputs=length, squeeze_axis=1))
+    else:
+        assert length is None or len(inputs) == length
+        if merge is True:
+            inputs = [mx.sym.expand_dims(i, axis=axis) for i in inputs]
+            inputs = mx.sym.Concat(*inputs, dim=axis)
+            in_axis = axis
+    if isinstance(inputs, mx.sym.Symbol) and axis != in_axis:
+        inputs = mx.sym.swapaxes(inputs, dim0=axis, dim1=in_axis)
+    return inputs, axis
+
+def get_data(layout, infer=False):
+    """Unpickle the data iterators under ./data and call initialize(curr_batch_size=args.batch_size) on each.
+
+    `layout` is accepted but not used; the batch size is read from the global `args`. The two return tuples order
+    the vocabularies differently:
+      infer=False: (train_iter, valid_iter, test_iter, src_vocab, targ_vocab, inv_src_vocab, inv_targ_vocab)
+      infer=True:  (test_iter, src_vocab, inv_src_vocab, targ_vocab, inv_targ_vocab)
+    """
+    start = time()
+    # SECURITY: pickle.load can execute arbitrary code -- only load iterator files produced by a trusted run.
+    if not infer:
+        # Announce the training iterator only when it is really loaded (this used to be printed in infer mode too).
+        print("\nUnpickling training iterator")
+        with open('./data/train_iterator.pkl', 'rb') as f: # _en_de.pkl
+            train_iter = pickle.load(f)
+        train_iter.initialize(curr_batch_size=args.batch_size)
+
+        print("\nUnpickling validation iterator")
+        with open('./data/valid_iterator.pkl', 'rb') as f: # _en_de.pkl
+            valid_iter = pickle.load(f)
+        valid_iter.initialize(curr_batch_size=args.batch_size)
+
+    with open('./data/test_iterator.pkl', 'rb') as f:
+        test_iter = pickle.load(f)
+    test_iter.initialize(curr_batch_size=args.batch_size)
+
+#    print("\nEncoded source language sentences:\n")
+#    for i in range(5):
+#        print(array_to_text(train_iter.src_sent[i], train_iter.inv_src_vocab))
+
+#    print("\nEncoded target language sentences:\n")
+#    for i in range(5):
+#        print(array_to_text(train_iter.targ_sent[i], train_iter.inv_targ_vocab))
+
+    duration = time() - start
+    print("\nDataset deserialization time: %.2f seconds\n" % duration)
+
+    if infer:
+        return test_iter, test_iter.src_vocab, test_iter.inv_src_vocab, test_iter.targ_vocab, test_iter.inv_targ_vocab
+    return train_iter, valid_iter, test_iter, train_iter.src_vocab, train_iter.targ_vocab, train_iter.inv_src_vocab, train_iter.inv_targ_vocab
+
+def attention_step(i, encoder_outputs, decoder_output):
+    """Build the dot-product attention vector for decoder step `i`.
+
+    Encoder output j gets the weight exp(<encoder_outputs[j], decoder_output>) for every example; the result is
+    the weighted sum of the encoder outputs divided by the per-example sum of the weights.
+    Assumes each encoder output and `decoder_output` is 2-D (batch, hidden) -- TODO confirm against the callers.
+    """
+    attention_state = mx.sym.zeros_like(encoder_outputs[-1], name='train_dec_unroll_attention_state')
+    curr_att_input = mx.sym.expand_dims(decoder_output, axis=2, name='train_dec_unroll_expand_dims_%d_' % i)
+    enc_len = len(encoder_outputs)
+    dots = []
+    # loop over all the encoder periods to create weights for weighted state
+    for j in range(enc_len):
+        transposed = mx.sym.expand_dims(encoder_outputs[j], axis=2)
+        transposed = mx.sym.transpose(transposed, axes=(0, 2, 1), name='train_decoder_transpose%d_' % i)
+        dot = mx.sym.batch_dot(transposed, curr_att_input, name='train_decoder_batch_dot_%d_%d_' % (i, j))
+        dot = mx.sym.exp(dot, name='train_decoder_exp_%d_%d' % (i, j))
+        # Flatten to one weight per example with an inferred batch dimension. The previous code reshaped with
+        # args.batch_size / len(contexts), which raised TypeError when `contexts` was a single mx.Context and
+        # hard-wired the graph to one batch size.
+        dot = mx.sym.reshape(dot, shape=(-1, 1), name='train_decoder_unroll_reshape_%d_%d' % (i, j))
+        dots.append(dot)
+        attention_state = attention_state + mx.sym.broadcast_mul(dot, encoder_outputs[j],
+                                                                 name='train_encoder_acc_attention_%d_%d_' % (i, j))
+
+    # Normalise with the per-example sum of the weights. The previous code concatenated every (1, batch) weight
+    # row along axis 1 and summed that axis, so all examples were divided by one batch-wide total.
+    dot_sum = mx.sym.ElementWiseSum(*dots, name='train_decoder_dot_sum_%d_' % i)
+    attention_state = mx.sym.broadcast_div(attention_state, dot_sum)
+
+    return attention_state
+
+
+def train_decoder_unroll(decoder, encoder_outputs, target_embed, targ_vocab, unroll_length,
+                         go_symbol, fc_weight, fc_bias, attention_fc_weight, attention_fc_bias, targ_em_weight,
+                        begin_state=None, layout='TNC', merge_outputs=None):
+    """Unroll `decoder` for `unroll_length` steps, feeding the embedded target token of each step as input.
+
+    Reads the global `args`: with args.input_feed the previous attention vector is concatenated to the input; with
+    args.attention each output becomes tanh(FC([decoder output, attention vector])). Returns `(outputs, states)`.
+    `targ_vocab`, `go_symbol`, `fc_weight`, `fc_bias` and `targ_em_weight` are accepted but not used here."""
+    decoder.reset()
+    if begin_state is None:
+        begin_state = decoder.begin_state()
+    inputs, _ = _normalize_sequence(unroll_length, target_embed, layout, False)
+    # Need to use hidden state from attention model, but <GO> as input
+    states = begin_state
+    outputs = []
+    # At the first time step there is no previous attention vector / decoder output, so zeros are substituted below.
+    attention_state = None
+    dec_out = None
+    for i in range(unroll_length):
+        if args.input_feed:
+            # Test against None explicitly: a Symbol must not be used as a truth value.
+            prev_attention_state = attention_state if attention_state is not None else mx.sym.zeros_like(encoder_outputs[-1], name='train_dec_unroll_prev_attention_state')
+            decoder_feed = mx.sym.concat(inputs[i], prev_attention_state, name = 'decoder_feed_concat_%d_' % i)
+        else:
+            decoder_feed = inputs[i]
+        prev_dec_out = dec_out if dec_out is not None else mx.sym.zeros_like(encoder_outputs[-1], name='train_dec_unroll_prev_dec_out') # begin_state
+        dec_out, states = decoder(decoder_feed, states)
+        if args.attention:
+            # NOTE(review): attention is queried with the PREVIOUS decoder output here, while
+            # infer_decoder_unroll queries it with the current one -- confirm which is intended.
+            attention_state = attention_step(i, encoder_outputs, prev_dec_out)
+            # The attention output is combined with the decoder output for computing the next word
+            concatenated = mx.sym.concat(dec_out, attention_state, name = 'train_decoder_concat_%d_' % i)
+            attention_fc = mx.sym.FullyConnected(data=concatenated, weight=attention_fc_weight, bias=attention_fc_bias,
+                                                 num_hidden=args.num_hidden, name='attention_fc%d_' % i)
+            curr_out = mx.sym.Activation(data = attention_fc, act_type='tanh', name = 'attention_tanh%d_' % i)
+        else:
+            # We avoid all the attention computation
+            curr_out = dec_out
+        outputs.append(curr_out)
+    outputs, _ = _normalize_sequence(unroll_length, outputs, layout, merge_outputs)
+    return outputs, states
+
+
+def infer_decoder_unroll(decoder, encoder_outputs, target_embed, targ_vocab, unroll_length,
+                  go_symbol, fc_weight, fc_bias, attention_fc_weight, attention_fc_bias, targ_em_weight,
+                  begin_state=None, layout='TNC', merge_outputs=None):
+    """Unroll `decoder` for `unroll_length` steps, feeding back the embedding of its own arg-max prediction.
+
+    Only the first step consumes `target_embed` (its first time slice); every later input is
+    Embedding(argmax(FC(previous output))) built from `fc_weight`, `fc_bias` and `targ_em_weight`. Reads the
+    global `args` (input_feed, attention, num_hidden, num_embed). Returns `(outputs, states)`."""
+    decoder.reset()
+    if begin_state is None:
+        begin_state = decoder.begin_state()
+    inputs, _ = _normalize_sequence(unroll_length, target_embed, layout, False)
+    # Need to use hidden state from attention model, but <GO> as input
+    states = begin_state
+    outputs = []
+    embed = inputs[0]
+    attention_state = None
+    for i in range(unroll_length):
+        if args.input_feed:
+            # Previous attention output (zeros at step 0); tested against None because a Symbol is not a truth value.
+            prev_attention_state = (attention_state if attention_state is not None
+                                    else mx.sym.zeros_like(encoder_outputs[-1], name='train_dec_unroll_prev_attention_state'))
+            decoder_feed = mx.sym.concat(embed, prev_attention_state, name='decoder_feed_concat_%d_' % i)
+        else:
+            decoder_feed = embed
+        dec_out, states = decoder(decoder_feed, states)
+        # Should this be dec_out or states as the first argument?
+        if args.attention:
+            attention_state = attention_step(i, encoder_outputs, dec_out)
+            concatenated = mx.sym.concat(dec_out, attention_state, name = 'train_decoder_concat_%d_' % i)
+            attention_fc = mx.sym.FullyConnected(data=concatenated, weight=attention_fc_weight, bias=attention_fc_bias,
+                                                 num_hidden=args.num_hidden, name='attention_fc%d_' % i)
+            curr_out = mx.sym.Activation(data = attention_fc, act_type='tanh', name = 'attention_tanh%d_' % i)
+        else:
+            curr_out = dec_out
+        outputs.append(curr_out)
+        fc = mx.sym.FullyConnected(data=curr_out, weight=fc_weight, bias=fc_bias, num_hidden=len(targ_vocab), name='decoder_fc%d_'%i)
+        am = mx.sym.argmax(data=fc, axis=1)
+        embed = mx.sym.Embedding(data=am, weight=targ_em_weight, input_dim=len(targ_vocab), output_dim=args.num_embed, name='decoder_embed%d_'%i)
+    outputs, _ = _normalize_sequence(unroll_length, outputs, layout, merge_outputs)
+    return outputs, states
+
+def train(args):
+    """Build the seq2seq symbol generator and fit a BucketingModule on the iterators returned by get_data().
+
+    Resumes from `args.load_epoch` when it is non-zero, checkpoints through mx.rnn.do_rnn_checkpoint when
+    `args.model_prefix` is set, and prints the average wall-clock time per trained epoch."""
+    from time import time
+    data_train, data_val, _, src_vocab, targ_vocab, inv_src_vocab, inv_targ_vocab = get_data('TN')
+    print "len(src_vocab) len(targ_vocab)", len(src_vocab), len(targ_vocab)
+
+    attention_fc_weight = mx.sym.Variable('attention_fc_weight')
+    attention_fc_bias = mx.sym.Variable('attention_fc_bias')
+
+    fc_weight = mx.sym.Variable('fc_weight')
+    fc_bias = mx.sym.Variable('fc_bias')
+    targ_em_weight = mx.sym.Variable('targ_embed_weight')
+
+    encoder = SequentialRNNCell()
+
+    if args.use_cudnn_cells:
+        encoder.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout,
+            mode='lstm', prefix='lstm_encoder', bidirectional=args.bidirectional, get_next_state=True))
+    else:
+        for i in range(args.num_layers):
+            if args.bidirectional:
+                encoder.add(
+                    BidirectionalCell(
+                        LSTMCell(args.num_hidden // 2, prefix='rnn_encoder_f%d_' % i),
+                        LSTMCell(args.num_hidden // 2, prefix='rnn_encoder_b%d_' % i)))
+                if i < args.num_layers - 1 and args.dropout > 0.0:
+                    encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i))
+            else:
+                encoder.add(
+                    LSTMCell(args.num_hidden, prefix='rnn_encoder%d_' % i))
+                if i < args.num_layers - 1 and args.dropout > 0.0:
+                    encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i))
+
+    decoder = mx.rnn.SequentialRNNCell()
+
+    if args.use_cudnn_cells:
+        decoder.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers,
+            mode='lstm', prefix='lstm_decoder', bidirectional=args.bidirectional, get_next_state=True))
+    else:
+        for i in range(args.num_layers):
+            decoder.add(LSTMCell(args.num_hidden, prefix=('rnn_decoder%d_' % i)))
+            if i < args.num_layers - 1 and args.dropout > 0.0:
+                decoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_decoder%d_' % i))
+
+    def sym_gen(seq_len):
+        src_data = mx.sym.Variable('src_data')
+        targ_data = mx.sym.Variable('targ_data')
+        label = mx.sym.Variable('softmax_label')
+
+        src_embed = mx.sym.Embedding(data=src_data, input_dim=len(src_vocab),
+                                 output_dim=args.num_embed, name='src_embed')
+        targ_embed = mx.sym.Embedding(data=targ_data, weight=targ_em_weight, input_dim=len(targ_vocab),    # data=data
+                                 output_dim=args.num_embed, name='targ_embed')
+
+        encoder.reset()
+        decoder.reset()
+
+        enc_seq_len, dec_seq_len = seq_len
+
+        layout = 'TNC'
+        encoder_outputs, encoder_states = encoder.unroll(enc_seq_len, inputs=src_embed, layout=layout)
+        # Pair entry k of encoder_states[0] with entry k of encoder_states[1] (presumably the forward / backward
+        if args.bidirectional:  # state lists -- TODO confirm). The first pair used to join [0][0] with [0][1].
+            encoder_states = [mx.sym.concat(encoder_states[0][0], encoder_states[1][0]),
+                              mx.sym.concat(encoder_states[0][1], encoder_states[1][1])]
+        # NOTE(review): only encoder_states[0] and [1] are merged -- confirm this is right for num_layers > 1.
+        if args.remove_state_feed:
+            encoder_states = None
+
+        # This should be based on EOS or max seq len for inference, but here we unroll to the target length
+        # TODO: fix <GO> symbol
+        if args.inference_unrolling_for_training:
+            outputs, _ = infer_decoder_unroll(decoder, encoder_outputs, targ_embed, targ_vocab, dec_seq_len, 0, fc_weight, fc_bias,
+                             attention_fc_weight, attention_fc_bias,
+                             targ_em_weight, begin_state=encoder_states, layout='TNC', merge_outputs=True)
+        else:
+            outputs, _ = train_decoder_unroll(decoder, encoder_outputs, targ_embed, targ_vocab, dec_seq_len, 0, fc_weight, fc_bias,
+                             attention_fc_weight, attention_fc_bias,
+                             targ_em_weight, begin_state=encoder_states, layout='TNC', merge_outputs=True)
+
+        # NEW
+        rs = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden), name='sym_gen_reshape1')
+        fc = mx.sym.FullyConnected(data=rs, weight=fc_weight, bias=fc_bias, num_hidden=len(targ_vocab), name='sym_gen_fc')
+        label_rs = mx.sym.Reshape(data=label, shape=(-1,), name='sym_gen_reshape2')
+        pred = mx.sym.SoftmaxOutput(data=fc, label=label_rs, name='sym_gen_softmax')
+
+        return pred, ('src_data', 'targ_data',), ('softmax_label',)
+
+#    foo, _, _ = sym_gen((1, 1))
+#    print(type(foo))
+#    mx.viz.plot_network(symbol=foo).save('./seq2seq.dot')
+
+    if args.gpus:
+        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    else:
+        contexts = mx.cpu(0)
+
+    model = mx.mod.BucketingModule(
+        sym_gen             = sym_gen,
+        default_bucket_key  = data_train.default_bucket_key,
+        context             = contexts)
+
+    if args.load_epoch:
+        _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(
+            [encoder, decoder], args.model_prefix, args.load_epoch)
+    else:
+        arg_params = None
+        aux_params = None
+
+    opt_params = {
+      'learning_rate': args.lr,
+      'wd': args.wd
+    }
+
+    if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']:
+        opt_params['momentum'] = args.mom
+
+    opt_params['clip_gradient'] = args.max_grad_norm
+
+    start = time()
+
+    model.fit(
+        train_data          = data_train,
+        eval_data           = data_val,
+        eval_metric         = mx.metric.Perplexity(invalid_label),
+        kvstore             = args.kv_store,
+        optimizer           = args.optimizer,
+        optimizer_params    = opt_params,
+        initializer         = mx.init.Xavier(factor_type="in", magnitude=2.34),
+        arg_params          = arg_params,
+        aux_params          = aux_params,
+        begin_epoch         = args.load_epoch,
+        num_epoch           = args.num_epochs,
+        batch_end_callback  = mx.callback.Speedometer(batch_size=args.batch_size, frequent=args.disp_batches, auto_reset=True),
+        epoch_end_callback  = mx.rnn.do_rnn_checkpoint([encoder, decoder], args.model_prefix, 1)
+                              if args.model_prefix else None)
+    # fit() is given begin_epoch=load_epoch, so average over those epochs only; max() avoids a division by zero.
+    train_duration = time() - start
+    time_per_epoch = train_duration / max(args.num_epochs - args.load_epoch, 1)
+    print("\n\nTime per epoch: %.2f seconds\n\n" % time_per_epoch)
+
+
+def drop_sentinels(text_lst):
+    """Skip leading <PAD>/<GO> ids, then keep ids until the next <PAD>, <GO> or <EOS>."""
+    filler = lambda tok: tok == reserved_tokens['<PAD>'] or tok == reserved_tokens['<GO>']
+    keep = lambda tok: not (filler(tok) or tok == reserved_tokens['<EOS>'])
+    return list(takewhile(keep, dropwhile(filler, text_lst)))
+
+
+class BleuScore(mx.metric.EvalMetric):
+    """Evaluation metric: mean smoothed sentence-level BLEU over all scored sentences."""
+    def __init__(self, ignore_label, axis=-1):
+        super(BleuScore, self).__init__('BleuScore')
+        self.ignore_label = ignore_label    # stored only; update() does not read it
+        self.axis = axis                    # stored only; update() always uses axis=1
+
+    def update(self, labels, preds):
+        """Add one BLEU score per sentence (column) of every label/pred pair."""
+        assert len(labels) == len(preds)
+        smoothing_fn = nltk.translate.bleu_score.SmoothingFunction().method3
+
+        for label, pred in zip(labels, preds):
+            # Validate first: a size mismatch would otherwise fail inside reshape() below.
+            assert label.size == pred.size/pred.shape[-1], \
+                "shape mismatch: %s vs. %s"%(label.shape, pred.shape)
+            maxed = mx.ndarray.argmax(data=pred, axis=1)
+            pred_nparr = maxed.asnumpy()
+            label_nparr = label.asnumpy().astype(np.int32)
+            sent_len, batch_size = np.shape(label_nparr)
+            pred_nparr = pred_nparr.reshape(sent_len, batch_size).astype(np.int32)
+
+            for i in range(batch_size):
+                # Column i is one sentence; sentinel ids are stripped before scoring.
+                expected = drop_sentinels(label_nparr[:, i].tolist())
+                actual = drop_sentinels(pred_nparr[:, i].tolist())
+                bleu = nltk.translate.bleu_score.sentence_bleu(
+                    references=[expected], hypothesis=actual, weights=(0.25, 0.25, 0.25, 0.25),
+                    smoothing_function=smoothing_fn)
+                self.sum_metric += bleu
+                self.num_inst += 1
+
+    def get(self):
+        """Return (name, mean BLEU); the value is NaN when nothing has been scored."""
+        num = self.num_inst if self.num_inst > 0 else float('nan')
+        return (self.name, self.sum_metric/num)
+
+
+def infer(args):
+    """Score a saved seq2seq checkpoint on the test set and print sample translations.
+
+    Rebuilds the encoder/decoder cells, binds a BucketingModule for inference, loads
+    the checkpoint ``args.model_prefix`` at ``args.load_epoch`` (when non-zero), takes
+    the argmax of the model outputs for every test batch, and reports smoothed
+    sentence-level BLEU averaged over all test sentences.
+
+    Args:
+        args: parsed command-line namespace produced by the module-level ``parser``.
+    """
+    assert args.model_prefix, "Must specifiy path to load from"
+
+    data_test, src_vocab, inv_src_vocab, targ_vocab, inv_targ_vocab = get_data('TN', infer=True)
+
+    print "len(src_vocab) len(targ_vocab)", len(src_vocab), len(targ_vocab)
+
+    # Named parameter variables handed to infer_decoder_unroll(); fc_* and
+    # targ_embed_weight are also used directly by layers built in sym_gen().
+    attention_fc_weight = mx.sym.Variable('attention_fc_weight')
+    attention_fc_bias = mx.sym.Variable('attention_fc_bias')
+
+    fc_weight = mx.sym.Variable('fc_weight')
+    fc_bias = mx.sym.Variable('fc_bias')
+    targ_em_weight = mx.sym.Variable('targ_embed_weight')
+
+    # Encoder: fused cell unfused for inference, or an explicit stack of LSTM cells.
+    if args.use_cudnn_cells:
+        encoder = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout,
+            mode='lstm', prefix='lstm_encoder', bidirectional=args.bidirectional, get_next_state=True).unfuse()
+
+    else:
+        encoder = SequentialRNNCell()
+
+        for i in range(args.num_layers):
+            if args.bidirectional:
+                encoder.add(
+                    BidirectionalCell(
+                        LSTMCell(args.num_hidden // 2, prefix='rnn_encoder_f%d_' % i),
+                        LSTMCell(args.num_hidden // 2, prefix='rnn_encoder_b%d_' % i)))
+                if i < args.num_layers - 1 and args.dropout > 0.0:
+                    encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i))
+            else:
+                encoder.add(
+                    LSTMCell(args.num_hidden, prefix='rnn_encoder%d_' % i))
+                if i < args.num_layers - 1 and args.dropout > 0.0:
+                    encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i))
+
+    # Decoder: built the same way; the explicit stack uses plain LSTM layers.
+    if args.use_cudnn_cells:
+        decoder = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers,
+            mode='lstm', prefix='lstm_decoder', bidirectional=args.bidirectional, get_next_state=True).unfuse()
+
+    else:
+        decoder = mx.rnn.SequentialRNNCell()
+
+        for i in range(args.num_layers):
+            decoder.add(LSTMCell(args.num_hidden, prefix=('rnn_decoder%d_' % i)))
+            if i < args.num_layers - 1 and args.dropout > 0.0:
+                decoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_decoder%d_' % i))
+
+    def sym_gen(seq_len):
+        """Build the inference symbol for one (encoder length, decoder length) bucket."""
+        src_data = mx.sym.Variable('src_data')
+        targ_data = mx.sym.Variable('targ_data')
+        label = mx.sym.Variable('softmax_label')
+
+        src_embed = mx.sym.Embedding(data=src_data, input_dim=len(src_vocab),
+                                 output_dim=args.num_embed, name='src_embed')
+        targ_embed = mx.sym.Embedding(data=targ_data, input_dim=len(targ_vocab),
+                                 weight = targ_em_weight,    # data=data
+                                 output_dim=args.num_embed, name='targ_embed')
+
+        encoder.reset()
+        decoder.reset()
+
+        enc_seq_len, dec_seq_len = seq_len
+
+        layout = 'TNC'
+        encoder_outputs, encoder_states = encoder.unroll(enc_seq_len, inputs=src_embed, layout=layout)
+
+        if args.bidirectional:
+            # NOTE(review): the first concat pairs [0][0] with [0][1] while the second pairs
+            # [0][1] with [1][1]; confirm against the state layout encoder.unroll() returns.
+            encoder_states = [mx.sym.concat(encoder_states[0][0], encoder_states[0][1]),
+                              mx.sym.concat(encoder_states[0][1], encoder_states[1][1])]
+
+        # This should be based on EOS or max seq len for inference, but here we unroll to the target length
+        # TODO: fix <GO> symbol
+        outputs, _ = infer_decoder_unroll(decoder, encoder_outputs, targ_embed, targ_vocab, dec_seq_len, 0,
+                     fc_weight, fc_bias,
+                     attention_fc_weight, attention_fc_bias,
+                     targ_em_weight,
+                     begin_state=encoder_states, layout='TNC', merge_outputs=True)
+
+        # Project every decoder step onto the target vocabulary and apply softmax.
+        rs = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden), name='sym_gen_reshape1')
+        fc = mx.sym.FullyConnected(data=rs, weight=fc_weight, bias=fc_bias, num_hidden=len(targ_vocab), name='sym_gen_fc')
+        label_rs = mx.sym.Reshape(data=label, shape=(-1,), name='sym_gen_reshape2')
+        pred = mx.sym.SoftmaxOutput(data=fc, label=label_rs, name='sym_gen_softmax')
+
+        return pred, ('src_data', 'targ_data',), ('softmax_label',)
+
+    # Run on the GPUs listed in args.gpus, otherwise on the CPU.
+    if args.gpus:
+        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    else:
+        contexts = mx.cpu(0)
+
+    model = mx.mod.BucketingModule(
+        sym_gen             = sym_gen,
+        default_bucket_key  = data_test.default_bucket_key,
+        context             = contexts)
+
+    # Bind for inference only.
+    model.bind(data_test.provide_data, data_test.provide_label, for_training=False)
+
+    if args.load_epoch:
+        _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(
+            [encoder, decoder], args.model_prefix, args.load_epoch)
+        model.set_params(arg_params, aux_params)
+    # NOTE(review): when args.load_epoch is unset/0 nothing is loaded or initialised
+    # here; confirm whether inference should reject that case up front.
+
+    examples = []       # (source ids, expected ids, predicted ids, BLEU) per sentence
+    bleu_acc = 0.0
+    num_inst = 0
+
+    try:
+        data_test.reset()
+
+        smoothing_fn = nltk.translate.bleu_score.SmoothingFunction().method3
+
+        # Iterate until data_test.next() raises StopIteration.
+        while True:
+
+            data_batch = data_test.next()
+            model.forward(data_batch, is_train=None)
+            source = data_batch.data[0]
+            preds = model.get_outputs()[0]
+            labels = data_batch.label[0]
+
+            # Highest-scoring target id for each row of the model output.
+            maxed = mx.ndarray.argmax(data=preds, axis=1)
+            pred_nparr = maxed.asnumpy()
+            src_nparr = source.asnumpy()
+            label_nparr = labels.asnumpy().astype(np.int32)
+            sent_len, batch_size = np.shape(label_nparr)
+            pred_nparr = pred_nparr.reshape(sent_len, batch_size).astype(np.int32)
+
+            # Score each sentence (one column of the time-major batch) separately.
+            for i in range(batch_size):
+
+                # NOTE(review): the source column is reversed for display; presumably
+                # get_data() feeds reversed source sentences. Confirm.
+                src_lst = list(reversed(drop_sentinels(src_nparr[:, i].tolist())))
+                exp_lst = drop_sentinels(label_nparr[:, i].tolist())
+                act_lst = drop_sentinels(pred_nparr[:, i].tolist())
+
+                bleu = nltk.translate.bleu_score.sentence_bleu(
+                    references=[exp_lst], hypothesis=act_lst, weights=(0.25, 0.25, 0.25, 0.25),
+                    smoothing_function = smoothing_fn
+                )
+                bleu_acc += bleu
+                num_inst += 1
+                examples.append((src_lst, exp_lst, act_lst, bleu))
+
+    except StopIteration:
+        # Raised by data_test.next() once the test set is exhausted.
+        pass
+
+    # Average BLEU; report NaN instead of dividing by zero when no sentence was scored.
+    if num_inst > 0:
+        bleu_acc /= num_inst
+    else:
+        bleu_acc = float('nan')
+
+    # Find the top K best translations
+    examples = sorted(examples, key=itemgetter(3), reverse=True)
+
+    num_examples = 20
+
+    print("\nSample translations:\n")
+    for i in range(min(num_examples, len(examples))):
+        src_lst, exp_lst, act_lst, bleu = examples[i]
+        src_txt = array_to_text(src_lst, data_test.inv_src_vocab)
+        exp_txt = array_to_text(exp_lst, data_test.inv_targ_vocab)
+        act_txt = array_to_text(act_lst, data_test.inv_targ_vocab)
+        print("\n")
+        print("Source text: %s" % src_txt)
+        print("Expected translation: %s" % exp_txt)
+        print("Actual translation: %s" % act_txt)
+    print("\nTest set BLEU score (averaged over all examples): %.3f\n" % bleu_acc)
+
+if __name__ == '__main__':
+    import logging
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+
+    args = parser.parse_args()
+
+    if args.input_feed:
+        assert (args.attention == True), "--input-feed is legal only with --attention!"
+
+    # set random seeds for Python, NumPy and MxNet
+    import random
+    seed = args.seed
+    np.random.seed(seed)
+    random.seed(seed)
+    mx.random.seed(seed)
+    print("Using seed: %d" % seed)
+
+    if args.gpus:
+        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    else:
+        contexts = mx.cpu(0)
+
+    print("\n")
+    # args.gpus may be unset/empty (CPU fallback above), so test it before calling split().
+    if args.num_layers >= 4 and args.gpus and len(args.gpus.split(',')) >= 4 and not args.stack_rnn:
+        print('WARNING: stack-rnn is recommended to train complex model on multiple GPUs')
+
+    if args.infer:
+        # Demonstrates how to load a model trained with CuDNN RNN and predict
+        # with non-fused MXNet symbol
+        infer(args)
+    else:
+        if args.inference_unrolling_for_training:
+            print("INFO: Using inference decoder unrolling for training")
+        else:
+            print("INFO: Using regular decoder unrolling for training")
+        train(args)
diff --git a/seq2seq/seq2seq_iterator.py b/seq2seq_iterator.py
similarity index 98%
rename from seq2seq/seq2seq_iterator.py
rename to seq2seq_iterator.py
index 3fd096b..d851f32 100644
--- a/seq2seq/seq2seq_iterator.py
+++ b/seq2seq_iterator.py
@@ -102,17 +102,20 @@ def __init__(
 #        else:
 #            raise ValueError("Invalid layout %s: Must by NT (batch major) or TN (time major)")
 
-    def initialize(self):
+    def initialize(self, curr_batch_size=None):
+        if curr_batch_size:
+            self.batch_size = curr_batch_size
+            self.default_bucket_key = (self.default_bucket_key[0]+1, self.default_bucket_key[1]+1)
         if self.layout == 'TN':
             self.provide_data = [
                 mx.io.DataDesc(self.src_data_name, (self.default_bucket_key[0], self.batch_size), layout='TN'),
-                mx.io.DataDesc(self.targ_data_name, (self.default_bucket_key[0], self.batch_size), layout='TN')
+                mx.io.DataDesc(self.targ_data_name, (self.default_bucket_key[1], self.batch_size), layout='TN')
             ]
             self.provide_label = [mx.io.DataDesc(self.label_name, (self.default_bucket_key[1], self.batch_size), layout='TN')]
         elif self.layout == 'NT':
             self.provide_data = [
                 (self.src_data_name, (self.batch_size, self.default_bucket_key[0])),
-                (self.targ_data_name, (self.batch_size, self.default_bucket_key[0]))]
+                (self.targ_data_name, (self.batch_size, self.default_bucket_key[1]))]
             self.provide_label = [(self.label_name, (self.batch_size, self.default_bucket_key[1]))]
         else:
             raise ValueError("Invalid layout %s: Must by NT (batch major) or TN (time major)")
diff --git a/speedometer_reset.patch b/speedometer_reset.patch
deleted file mode 100644
index f284fd2..0000000
--- a/speedometer_reset.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py
-index 396f5a1..544eab2 100644
---- a/python/mxnet/callback.py
-+++ b/python/mxnet/callback.py
-@@ -96,13 +96,16 @@ class Speedometer(object):
-     frequent: int
-         How many batches between calculations.
-         Defaults to calculating & logging every 50 batches.
-+    auto_reset : bool
-+        Reset the metric after each log.
-     """
--    def __init__(self, batch_size, frequent=50):
-+    def __init__(self, batch_size, frequent=50, auto_reset=False):
-         self.batch_size = batch_size
-         self.frequent = frequent
-         self.init = False
-         self.tic = 0
-         self.last_count = 0
-+        self.auto_reset = auto_reset
- 
-     def __call__(self, param):
-         """Callback to Show speed."""
-@@ -116,7 +119,8 @@ class Speedometer(object):
-                 speed = self.frequent * self.batch_size / (time.time() - self.tic)
-                 if param.eval_metric is not None:
-                     name_value = param.eval_metric.get_name_value()
--                    param.eval_metric.reset()
-+                    if self.auto_reset:
-+                        param.eval_metric.reset()
-                     for name, value in name_value:
-                         logging.info('Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f',
-                                      param.epoch, count, speed, name, value)
diff --git a/seq2seq/split_train_valid.py b/split_train_valid.py
similarity index 100%
rename from seq2seq/split_train_valid.py
rename to split_train_valid.py
diff --git a/start_container.sh b/start_container.sh
deleted file mode 100755
index 685abfc..0000000
--- a/start_container.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-nvidia-docker run --rm -it -v `pwd`:/mxnet_seq2seq -p 8888:8888 mxnet_seq2seq
diff --git a/train_ptb/README.md b/train_ptb/README.md
deleted file mode 100644
index 8a6f29d..0000000
--- a/train_ptb/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-RNN Example
-===========
-This folder contains RNN examples using high level mxnet.rnn interface.
-
-Examples using low level symbol interface have been deprecated and moved to old/
-
-## Data
-Run `get_ptb_data.sh` to download PenTreeBank data.
-
-## Python
-
-- [lstm_bucketing.py](lstm_bucketing.py) PennTreeBank language model by using LSTM
-
-Performance Note:
-More ```MXNET_GPU_WORKER_NTHREADS``` may lead to better performance. For setting ```MXNET_GPU_WORKER_NTHREADS```, please refer to [Environment Variables](https://mxnet.readthedocs.org/en/latest/how_to/env_var.html).
diff --git a/train_ptb/cudnn_lstm_bucketing.py b/train_ptb/cudnn_lstm_bucketing.py
deleted file mode 100644
index 8e0ad9d..0000000
--- a/train_ptb/cudnn_lstm_bucketing.py
+++ /dev/null
@@ -1,218 +0,0 @@
-import numpy as np
-import mxnet as mx
-import argparse
-
-parser = argparse.ArgumentParser(description="Train RNN on Penn Tree Bank",
-                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--test', default=False, action='store_true',
-                    help='whether to do testing instead of training')
-parser.add_argument('--model-prefix', type=str, default=None,
-                    help='path to save/load model')
-parser.add_argument('--load-epoch', type=int, default=0,
-                    help='load from epoch')
-parser.add_argument('--num-layers', type=int, default=2,
-                    help='number of stacked RNN layers')
-parser.add_argument('--num-hidden', type=int, default=200,
-                    help='hidden layer size')
-parser.add_argument('--num-embed', type=int, default=200,
-                    help='embedding layer size')
-parser.add_argument('--bidirectional', type=bool, default=False,
-                    help='whether to use bidirectional layers')
-parser.add_argument('--gpus', type=str,
-                    help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. ' \
-                         'Increase batch size when using multiple gpus for best performance.')
-parser.add_argument('--kv-store', type=str, default='device',
-                    help='key-value store type')
-parser.add_argument('--num-epochs', type=int, default=25,
-                    help='max num of epochs')
-parser.add_argument('--lr', type=float, default=0.01,
-                    help='initial learning rate')
-parser.add_argument('--optimizer', type=str, default='sgd',
-                    help='the optimizer type')
-parser.add_argument('--mom', type=float, default=0.0,
-                    help='momentum for sgd')
-parser.add_argument('--wd', type=float, default=0.00001,
-                    help='weight decay for sgd')
-parser.add_argument('--batch-size', type=int, default=32,
-                    help='the batch size.')
-parser.add_argument('--disp-batches', type=int, default=50,
-                    help='show progress for every n batches')
-# When training a deep, complex model, it's recommended to stack fused RNN cells (one
-# layer per cell) together instead of one with all layers. The reason is that fused RNN
-# cells doesn't set gradients to be ready until the computation for the entire layer is
-# completed. Breaking a multi-layer fused RNN cell into several one-layer ones allows
-# gradients to be processed ealier. This reduces communication overhead, especially with
-# multiple GPUs.
-parser.add_argument('--stack-rnn', default=False,
-                    help='stack fused RNN cells to reduce communication overhead')
-parser.add_argument('--dropout', type=float, default='0.0',
-                    help='dropout keep probability')
-
-#buckets = [32]
-buckets = [10, 20, 30, 40, 50, 60]
-
-start_label = 1
-invalid_label = 0
-
-def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0):
-    lines = open(fname).readlines()
-    lines = [filter(None, i.split(' ')) for i in lines]
-    sentences, vocab = mx.rnn.encode_sentences(lines, vocab=vocab, invalid_label=invalid_label, start_label=start_label)
-    return sentences, vocab
-
-def get_data(layout):
-    train_sent, vocab = tokenize_text("./data/ptb.train.txt", start_label=start_label,
-                                      invalid_label=invalid_label)
-    val_sent, _ = tokenize_text("./data/ptb.test.txt", vocab=vocab, start_label=start_label,
-                                invalid_label=invalid_label)
-
-    data_train  = mx.rnn.BucketSentenceIter(train_sent, args.batch_size, buckets=buckets,
-                                            invalid_label=invalid_label, layout=layout)
-    data_val    = mx.rnn.BucketSentenceIter(val_sent, args.batch_size, buckets=buckets,
-                                            invalid_label=invalid_label, layout=layout)
-    return data_train, data_val, vocab
-
-
-def train(args):
-    data_train, data_val, vocab = get_data('TN')
-    if args.stack_rnn:
-        stack = mx.rnn.SequentialRNNCell()
-        for layer in range(args.num_layers):
-            dropout = 0.0
-            if layer < (args.num_layers - 1):
-                dropout = args.dropout
-            stack.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=1,
-                    mode='lstm', prefix='lstm_%d'%layer, dropout=dropout,
-                    bidirectional=args.bidirectional))
-        cell = stack
-    else:
-        cell = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout,
-                mode='lstm', bidirectional=args.bidirectional)
-
-    def sym_gen(seq_len):
-        data = mx.sym.Variable('data')
-        label = mx.sym.Variable('softmax_label')
-        embed = mx.sym.Embedding(data=data, input_dim=len(vocab), output_dim=args.num_embed,name='embed')
-
-        output, _ = cell.unroll(seq_len, inputs=embed, merge_outputs=True, layout='TNC')
-
-        pred = mx.sym.Reshape(output,
-                shape=(-1, args.num_hidden*(1+args.bidirectional)))
-        pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred')
-
-        label = mx.sym.Reshape(label, shape=(-1,))
-        pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')
-
-        return pred, ('data',), ('softmax_label',)
-
-    if args.gpus:
-        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
-    else:
-        contexts = mx.cpu(0)
-
-    model = mx.mod.BucketingModule(
-        sym_gen             = sym_gen,
-        default_bucket_key  = data_train.default_bucket_key,
-        context             = contexts)
-
-    if args.load_epoch:
-        _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(
-            cell, args.model_prefix, args.load_epoch)
-    else:
-        arg_params = None
-        aux_params = None
-
-    opt_params = {
-      'learning_rate': args.lr,
-      'wd': args.wd
-    }
-
-    if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']:
-        opt_params['momentum'] = args.mom
-
-    model.fit(
-        train_data          = data_train,
-        eval_data           = data_val,
-        eval_metric         = mx.metric.Perplexity(invalid_label),
-        kvstore             = args.kv_store,
-        optimizer           = args.optimizer,
-        optimizer_params    = opt_params, 
-        initializer         = mx.init.Xavier(factor_type="in", magnitude=2.34),
-        arg_params          = arg_params,
-        aux_params          = aux_params,
-        begin_epoch         = args.load_epoch,
-        num_epoch           = args.num_epochs,
-        batch_end_callback  = mx.callback.Speedometer(args.batch_size, args.disp_batches),
-        epoch_end_callback  = mx.rnn.do_rnn_checkpoint(cell, args.model_prefix, 1)
-                              if args.model_prefix else None)
-
-def test(args):
-    assert args.model_prefix, "Must specifiy path to load from"
-    _, data_val, vocab = get_data('NT')
-
-    if not args.stack_rnn:
-        stack = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers,
-                mode='lstm', bidirectional=args.bidirectional).unfuse()
-    else:
-        stack = mx.rnn.SequentialRNNCell()
-        for i in range(args.num_layers):
-            cell = mx.rnn.LSTMCell(num_hidden=args.num_hidden, prefix='lstm_%dl0_'%i)
-            if args.bidirectional:
-                cell = mx.rnn.BidirectionalCell(
-                        cell,
-                        mx.rnn.LSTMCell(num_hidden=args.num_hidden, prefix='lstm_%dr0_'%i),
-                        output_prefix='bi_lstm_%d'%i)
-            stack.add(cell)
-
-    def sym_gen(seq_len):
-        data = mx.sym.Variable('data')
-        label = mx.sym.Variable('softmax_label')
-        embed = mx.sym.Embedding(data=data, input_dim=len(vocab),
-                                 output_dim=args.num_embed, name='embed')
-
-        stack.reset()
-        outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True)
-
-        pred = mx.sym.Reshape(outputs,
-                shape=(-1, args.num_hidden*(1+args.bidirectional)))
-        pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred')
-
-        label = mx.sym.Reshape(label, shape=(-1,))
-        pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')
-
-        return pred, ('data',), ('softmax_label',)
-
-    if args.gpus:
-        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
-    else:
-        contexts = mx.cpu(0)
-
-    model = mx.mod.BucketingModule(
-        sym_gen             = sym_gen,
-        default_bucket_key  = data_val.default_bucket_key,
-        context             = contexts)
-    model.bind(data_val.provide_data, data_val.provide_label, for_training=False)
-
-    # note here we load using SequentialRNNCell instead of FusedRNNCell.
-    _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(stack, args.model_prefix, args.load_epoch)
-    model.set_params(arg_params, aux_params)
-
-    model.score(data_val, mx.metric.Perplexity(invalid_label),
-                batch_end_callback=mx.callback.Speedometer(args.batch_size, 5))
-
-if __name__ == '__main__':
-    import logging
-    head = '%(asctime)-15s %(message)s'
-    logging.basicConfig(level=logging.DEBUG, format=head)
-
-    args = parser.parse_args()
-
-    if args.num_layers >= 4 and len(args.gpus.split(',')) >= 4 and not args.stack_rnn:
-        print('WARNING: stack-rnn is recommended to train complex model on multiple GPUs')
-
-    if args.test:
-        # Demonstrates how to load a model trained with CuDNN RNN and predict
-        # with non-fused MXNet symbol
-        test(args)
-    else:
-        train(args)
diff --git a/train_ptb/get_ptb_data.sh b/train_ptb/get_ptb_data.sh
deleted file mode 100755
index 1ec009a..0000000
--- a/train_ptb/get_ptb_data.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env bash
-
-RNN_DIR=$(cd `dirname $0`; pwd)
-DATA_DIR="${RNN_DIR}/data/"
-
-if [[ ! -d "${DATA_DIR}" ]]; then
-  echo "${DATA_DIR} doesn't exist, will create one";
-  mkdir -p ${DATA_DIR}
-fi
-
-wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.train.txt;
-wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.valid.txt;
-wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.test.txt;
-wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt;
diff --git a/train_ptb/lstm_bucketing.py b/train_ptb/lstm_bucketing.py
deleted file mode 100644
index 4bc934a..0000000
--- a/train_ptb/lstm_bucketing.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import numpy as np
-import mxnet as mx
-import argparse
-
-parser = argparse.ArgumentParser(description="Train RNN on Penn Tree Bank",
-                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--num-layers', type=int, default=2,
-                    help='number of stacked RNN layers')
-parser.add_argument('--num-hidden', type=int, default=200,
-                    help='hidden layer size')
-parser.add_argument('--num-embed', type=int, default=200,
-                    help='embedding layer size')
-parser.add_argument('--gpus', type=str,
-                    help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. ' \
-                         'Increase batch size when using multiple gpus for best performance.')
-parser.add_argument('--kv-store', type=str, default='device',
-                    help='key-value store type')
-parser.add_argument('--num-epochs', type=int, default=25,
-                    help='max num of epochs')
-parser.add_argument('--lr', type=float, default=0.01,
-                    help='initial learning rate')
-parser.add_argument('--optimizer', type=str, default='sgd',
-                    help='the optimizer type')
-parser.add_argument('--mom', type=float, default=0.0,
-                    help='momentum for sgd')
-parser.add_argument('--wd', type=float, default=0.00001,
-                    help='weight decay for sgd')
-parser.add_argument('--batch-size', type=int, default=32,
-                    help='the batch size.')
-parser.add_argument('--disp-batches', type=int, default=50,
-                    help='show progress for every n batches')
-
-
-def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0):
-    lines = open(fname).readlines()
-    lines = [filter(None, i.split(' ')) for i in lines]
-    sentences, vocab = mx.rnn.encode_sentences(lines, vocab=vocab, invalid_label=invalid_label,
-                                               start_label=start_label)
-    return sentences, vocab
-
-
-if __name__ == '__main__':
-    import logging
-    head = '%(asctime)-15s %(message)s'
-    logging.basicConfig(level=logging.DEBUG, format=head)
-
-    args = parser.parse_args()
-
-    #buckets = []
-    buckets = [10, 20, 30, 40, 50, 60]
-
-    start_label = 1
-    invalid_label = 0
-
-    train_sent, vocab = tokenize_text("./data/ptb.train.txt", start_label=start_label,
-                                      invalid_label=invalid_label)
-    val_sent, _ = tokenize_text("./data/ptb.test.txt", vocab=vocab, start_label=start_label,
-                                invalid_label=invalid_label)
-
-    data_train  = mx.rnn.BucketSentenceIter(train_sent, args.batch_size, buckets=buckets,
-                                            invalid_label=invalid_label)
-    data_val    = mx.rnn.BucketSentenceIter(val_sent, args.batch_size, buckets=buckets,
-                                            invalid_label=invalid_label)
-
-    stack = mx.rnn.SequentialRNNCell()
-    for i in range(args.num_layers):
-        stack.add(mx.rnn.LSTMCell(num_hidden=args.num_hidden, prefix='lstm_l%d_'%i))
-
-    def sym_gen(seq_len):
-        data = mx.sym.Variable('data')
-        label = mx.sym.Variable('softmax_label')
-        embed = mx.sym.Embedding(data=data, input_dim=len(vocab),
-                                 output_dim=args.num_embed, name='embed')
-
-        stack.reset()
-        outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True)
-
-        pred = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden))
-        pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred')
-
-        label = mx.sym.Reshape(label, shape=(-1,))
-        pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')
-
-        return pred, ('data',), ('softmax_label',)
-
-    if args.gpus:
-        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
-    else:
-        contexts = mx.cpu(0)
-
-    model = mx.mod.BucketingModule(
-        sym_gen             = sym_gen,
-        default_bucket_key  = data_train.default_bucket_key,
-        context             = contexts)
-
-    model.fit(
-        train_data          = data_train,
-        eval_data           = data_val,
-        eval_metric         = mx.metric.Perplexity(invalid_label),
-        kvstore             = args.kv_store,
-        optimizer           = args.optimizer,
-        optimizer_params    = { 'learning_rate': args.lr,
-                                'momentum': args.mom,
-                                'wd': args.wd },
-        initializer         = mx.init.Xavier(factor_type="in", magnitude=2.34),
-        num_epoch           = args.num_epochs,
-        batch_end_callback  = mx.callback.Speedometer(args.batch_size, args.disp_batches))
diff --git a/seq2seq/utils.py b/utils.py
similarity index 83%
rename from seq2seq/utils.py
rename to utils.py
index 237547d..28be52f 100644
--- a/seq2seq/utils.py
+++ b/utils.py
@@ -15,8 +15,8 @@
 
 Dataset = namedtuple(
     'Dataset',
-    ['src_train_sent', 'src_valid_sent', 'src_vocab', 'inv_src_vocab', 
-     'targ_train_sent', 'targ_valid_sent', 'targ_vocab', 'inv_targ_vocab'])
+    ['src_train_sent', 'src_valid_sent', 'src_test_sent', 'src_vocab', 'inv_src_vocab', 
+     'targ_train_sent', 'targ_valid_sent', 'targ_test_sent', 'targ_vocab', 'inv_targ_vocab'])
 
 def invert_dict(d):
     return {v: k for k, v in d.iteritems()}
@@ -91,7 +91,8 @@ def array_to_text(array, inv_vocab):
         sent.append(inv_vocab[token])
     return " ".join(sent)
 
-def get_s2s_data(src_train_path, src_valid_path, targ_train_path, targ_valid_path,
+def get_s2s_data(src_train_path, src_valid_path, src_test_path, targ_train_path,
+    targ_valid_path, targ_test_path,
     reserved_tokens=['<UNK>', '<PAD>', '<EOS>', '<GO>']):
 
     print("Creating joint source dictionary")
@@ -99,8 +100,10 @@ def get_s2s_data(src_train_path, src_valid_path, targ_train_path, targ_valid_pat
        
     print("Tokenizing src_train_path") 
     src_train_sent = tokenize_text(src_train_path, vocab=src_dict)
-    print("Tokenizing targ_train_path")
+    print("Tokenizing src_valid_path")
     src_valid_sent = tokenize_text(src_valid_path, vocab=src_dict)
+    print("Tokenizing src_test_path")
+    src_test_sent = tokenize_text(src_test_path, vocab=src_dict)
 
     print("Creating joint target dictionary")
     targ_dict, inv_targ_dict = top_words_train_valid(targ_train_path, targ_valid_path)
@@ -109,6 +112,8 @@ def get_s2s_data(src_train_path, src_valid_path, targ_train_path, targ_valid_pat
     targ_train_sent = tokenize_text(targ_train_path, vocab=targ_dict)
     print("Tokenizing targ_valid_path")
     targ_valid_sent = tokenize_text(targ_valid_path, vocab=targ_dict)
+    print("Tokenizing targ_test_path")
+    targ_test_sent = tokenize_text(targ_test_path, vocab=targ_dict)
 
     print("\nEncoded source language sentences:\n")
     for i in range(5):
@@ -120,6 +125,7 @@ def get_s2s_data(src_train_path, src_valid_path, targ_train_path, targ_valid_pat
 
 
     return Dataset(
-        src_train_sent=src_train_sent, src_valid_sent=src_valid_sent, src_vocab=src_dict, inv_src_vocab=inv_src_dict,
-        targ_train_sent=targ_train_sent, targ_valid_sent=targ_valid_sent, targ_vocab=targ_dict, inv_targ_vocab=inv_targ_dict)
+        src_train_sent=src_train_sent, src_valid_sent=src_valid_sent, src_test_sent=src_test_sent,
+        src_vocab=src_dict, inv_src_vocab=inv_src_dict, targ_train_sent=targ_train_sent,
+        targ_valid_sent=targ_valid_sent, targ_test_sent=targ_test_sent, targ_vocab=targ_dict, inv_targ_vocab=inv_targ_dict)