diff --git a/Dockerfile b/Dockerfile deleted file mode 100755 index 0624d87..0000000 --- a/Dockerfile +++ /dev/null @@ -1,76 +0,0 @@ -FROM nvidia/cuda:8.0-cudnn5-devel - -RUN apt-get update && apt-get -y upgrade && \ - apt-get install -y \ - build-essential \ - ca-certificates \ - git \ - libopenblas-dev \ - libatlas-base-dev \ - libglib2.0-dev \ - libopencv-dev \ - python-dev \ - python-numpy \ - python-setuptools \ - wget \ - cmake \ - curl \ - python-pip \ - python-dev \ - unzip \ - sudo \ - vim \ - libglib2.0-dev \ - libtiff5-dev \ - libjpeg8-dev \ - zlib1g-dev - -RUN pip install --upgrade numpy scipy matplotlib scikit-learn sympy nltk setuptools requests - -COPY nan.patch /root - -# Build MxNet for Python -RUN cd /root && git clone --recursive https://github.com/dmlc/mxnet.git && cp nan.patch /root/mxnet/ && \ - cd mxnet && git checkout 955f6be6977ca1a27d3e912fd62a08f019dd1f76 && git apply nan.patch && \ - cp make/config.mk . && \ - echo "USE_CUDA=1" >> config.mk && \ - echo "USE_CUDNN=1" >> config.mk && \ - echo "CUDA_ARCH :=" \ - "-gencode arch=compute_35,code=sm_35" \ - "-gencode arch=compute_52,code=sm_52" \ - "-gencode arch=compute_60,code=sm_60" \ - "-gencode arch=compute_61,code=sm_61" \ - "-gencode arch=compute_61,code=compute_61" >> config.mk && \ - echo "USE_CUDA_PATH=/usr/local/cuda" >> config.mk - -ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/lib -RUN cd /root/mxnet && make -j$(nproc) && \ - mv lib/libmxnet.so /usr/local/lib && \ - ldconfig && \ - make clean && \ - cd python && \ - pip install -e . - -# Python3 support -RUN apt-get -y install python3-pip -RUN pip3 install numpy - -# Jupyter notebook support -COPY jupyter_notebook_config.py /root/.jupyter/jupyter_notebook_config.py -EXPOSE 8888 - -ENV PYTHONPATH /root/mxnet/python - -# Build MxNet for Scala -#RUN apt-get -y install maven openjdk-8-jdk scala -#RUN cd /root/mxnet && make scalapkg && make scalainstall - -# Build MxNet for R - WIP !!! 
-#RUN apt-get -y install r-base r-base-dev - -RUN pip install unidecode dill tqdm - -WORKDIR /root/mxnet - - - diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/seq2seq/README.md b/README.md similarity index 74% rename from seq2seq/README.md rename to README.md index 618e0bb..bbf30ab 100644 --- a/seq2seq/README.md +++ b/README.md @@ -1,11 +1,10 @@ -OpenNMT seq2seq model -========================== +LSTM encoder-decoder seq2seq model +================================== -This project is an implementation of the [OpenNMT sequence-to-sequence model](http://opennmt.net/Models/) in MxNet. The OpenNMT model is based on: +This project is an implementation of a simple encoder-decoder seq2seq model in MxNet. The model is based on: - a stacked LSTM encoder - a stacked LSTM decoder -- an attention model The reference model configuration is: @@ -38,8 +37,12 @@ How to run the scripts? 
Credits ------- -Thanks to Eric Xie (@piiswrong) for the attention cell implementation. +Many thanks to Eric Xie (@piiswrong), Sheng Zha (@szha) and Antti-Pekka Hynninen (@ap-hynninen) for valuable input. +License +------- + +This project is licensed under the Apache 2.0 license. See the text of the license [here](https://github.com/mkolod/mxnet_seq2seq/blob/master/LICENSE.txt). > **Note:** diff --git a/seq2seq/__init__.py b/__init__.py similarity index 100% rename from seq2seq/__init__.py rename to __init__.py diff --git a/seq2seq/attention_cell.py b/attention_cell.py similarity index 100% rename from seq2seq/attention_cell.py rename to attention_cell.py diff --git a/seq2seq/get_en_es_nmt_data.sh b/get_en_es_nmt_data.sh similarity index 100% rename from seq2seq/get_en_es_nmt_data.sh rename to get_en_es_nmt_data.sh diff --git a/get_opennmt_data.sh b/get_opennmt_data.sh new file mode 100755 index 0000000..69e07aa --- /dev/null +++ b/get_opennmt_data.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +DATA_DIR_ROOT="./data" +DATA_DIR="${DATA_DIR_ROOT}/wmt15-de-en" + +mkdir -p ${DATA_DIR} + +pushd . 
> /dev/null + +cd ${DATA_DIR_ROOT} + +echo -e "\nDownloading dataset" + +wget https://s3.amazonaws.com/opennmt-trainingdata/wmt15-de-en.tgz + +echo -e "\nDecompressing dataset\n" + +tar xvf wmt15-de-en.tgz + +echo -e "\nConcatenating corpora" + +cd wmt15-de-en + +# concatenate corpora - note concatenation has to be in +# the same order for both languages + +# the concatenated corpora form the training set +cat commoncrawl.de-en.de europarl-v7.de-en.de news-commentary-v10.de-en.de > train.de +# the validation set already officially exists (newstest2013) +mv newstest2013.de valid.de + +wget -O test.de https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2014.de + +# do the same thing to English corpora + +cat commoncrawl.de-en.en europarl-v7.de-en.en news-commentary-v10.de-en.en > train.en +mv newstest2013.en valid.en + +wget -O test.en https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2014.en + +popd > /dev/null + +echo -e "\nData download complete\n" diff --git a/jupyter_notebook_config.py b/jupyter_notebook_config.py deleted file mode 100755 index 646be37..0000000 --- a/jupyter_notebook_config.py +++ /dev/null @@ -1,584 +0,0 @@ -# Configuration file for jupyter-notebook. - -#------------------------------------------------------------------------------ -# Application(SingletonConfigurable) configuration -#------------------------------------------------------------------------------ - -## This is an application. - -## The date format used by logging formatters for %(asctime)s -#c.Application.log_datefmt = '%Y-%m-%d %H:%M:%S' - -## The Logging format template -#c.Application.log_format = '[%(name)s]%(highlevel)s %(message)s' - -## Set the log level by value or name. 
-#c.Application.log_level = 30 - -#------------------------------------------------------------------------------ -# JupyterApp(Application) configuration -#------------------------------------------------------------------------------ - -## Base class for Jupyter applications - -## Answer yes to any prompts. -#c.JupyterApp.answer_yes = False - -## Full path of a config file. -#c.JupyterApp.config_file = u'' - -## Specify a config file to load. -#c.JupyterApp.config_file_name = u'' - -## Generate default config file. -#c.JupyterApp.generate_config = False - -#------------------------------------------------------------------------------ -# NotebookApp(JupyterApp) configuration -#------------------------------------------------------------------------------ - -## Set the Access-Control-Allow-Credentials: true header -#c.NotebookApp.allow_credentials = False - -## Set the Access-Control-Allow-Origin header -# -# Use '*' to allow any origin to access your server. -# -# Takes precedence over allow_origin_pat. -#c.NotebookApp.allow_origin = '' - -## Use a regular expression for the Access-Control-Allow-Origin header -# -# Requests from an origin matching the expression will get replies with: -# -# Access-Control-Allow-Origin: origin -# -# where `origin` is the origin of the request. -# -# Ignored if allow_origin is set. -#c.NotebookApp.allow_origin_pat = '' - -## DEPRECATED use base_url -#c.NotebookApp.base_project_url = '/' - -## The base URL for the notebook server. -# -# Leading and trailing slashes can be omitted, and will automatically be added. -#c.NotebookApp.base_url = '/' - -## Specify what command to use to invoke a web browser when opening the notebook. -# If not specified, the default browser will be determined by the `webbrowser` -# standard library module, which allows setting of the BROWSER environment -# variable to override it. -#c.NotebookApp.browser = u'' - -## The full path to an SSL/TLS certificate file. 
-#c.NotebookApp.certfile = u'' - -## The full path to a certificate authority certificate for SSL/TLS client -# authentication. -#c.NotebookApp.client_ca = u'' - -## The config manager class to use -#c.NotebookApp.config_manager_class = 'notebook.services.config.manager.ConfigManager' - -## The notebook manager class to use. -#c.NotebookApp.contents_manager_class = 'notebook.services.contents.filemanager.FileContentsManager' - -## Extra keyword arguments to pass to `set_secure_cookie`. See tornado's -# set_secure_cookie docs for details. -#c.NotebookApp.cookie_options = {} - -## The random bytes used to secure cookies. By default this is a new random -# number every time you start the Notebook. Set it to a value in a config file -# to enable logins to persist across server sessions. -# -# Note: Cookie secrets should be kept private, do not share config files with -# cookie_secret stored in plaintext (you can read the value from a file). -#c.NotebookApp.cookie_secret = '' - -## The file where the cookie secret is stored. -#c.NotebookApp.cookie_secret_file = u'' - -## The default URL to redirect to from `/` -#c.NotebookApp.default_url = '/tree' - -## Disable cross-site-request-forgery protection -# -# Jupyter notebook 4.3.1 introduces protection from cross-site request -# forgeries, requiring API requests to either: -# -# - originate from pages served by this server (validated with XSRF cookie and -# token), or - authenticate with a token -# -# Some anonymous compute resources still desire the ability to run code, -# completely without authentication. These services can disable all -# authentication and security checks, with the full knowledge of what that -# implies. -#c.NotebookApp.disable_check_xsrf = False - -## Whether to enable MathJax for typesetting math/TeX -# -# MathJax is the javascript library Jupyter uses to render math/LaTeX. It is -# very large, so you may want to disable it if you have a slow internet -# connection, or for offline use of the notebook. 
-# -# When disabled, equations etc. will appear as their untransformed TeX source. -#c.NotebookApp.enable_mathjax = True - -## extra paths to look for Javascript notebook extensions -#c.NotebookApp.extra_nbextensions_path = [] - -## Extra paths to search for serving static files. -# -# This allows adding javascript/css to be available from the notebook server -# machine, or overriding individual files in the IPython -#c.NotebookApp.extra_static_paths = [] - -## Extra paths to search for serving jinja templates. -# -# Can be used to override templates from notebook.templates. -#c.NotebookApp.extra_template_paths = [] - -## -#c.NotebookApp.file_to_run = '' - -## Use minified JS file or not, mainly use during dev to avoid JS recompilation -#c.NotebookApp.ignore_minified_js = False - -## (bytes/sec) Maximum rate at which messages can be sent on iopub before they -# are limited. -#c.NotebookApp.iopub_data_rate_limit = 0 - -## (msg/sec) Maximum rate at which messages can be sent on iopub before they are -# limited. -#c.NotebookApp.iopub_msg_rate_limit = 0 - -## The IP address the notebook server will listen on. -c.NotebookApp.ip = '*' - -## Supply extra arguments that will be passed to Jinja environment. -#c.NotebookApp.jinja_environment_options = {} - -## Extra variables to supply to jinja templates when rendering. -#c.NotebookApp.jinja_template_vars = {} - -## The kernel manager class to use. -#c.NotebookApp.kernel_manager_class = 'notebook.services.kernels.kernelmanager.MappingKernelManager' - -## The kernel spec manager class to use. Should be a subclass of -# `jupyter_client.kernelspec.KernelSpecManager`. -# -# The Api of KernelSpecManager is provisional and might change without warning -# between this version of Jupyter and the next stable one. -#c.NotebookApp.kernel_spec_manager_class = 'jupyter_client.kernelspec.KernelSpecManager' - -## The full path to a private key file for usage with SSL/TLS. -#c.NotebookApp.keyfile = u'' - -## The login handler class to use. 
-#c.NotebookApp.login_handler_class = 'notebook.auth.login.LoginHandler' - -## The logout handler class to use. -#c.NotebookApp.logout_handler_class = 'notebook.auth.logout.LogoutHandler' - -## A custom url for MathJax.js. Should be in the form of a case-sensitive url to -# MathJax, for example: /static/components/MathJax/MathJax.js -#c.NotebookApp.mathjax_url = '' - -## Dict of Python modules to load as notebook server extensions.Entry values can -# be used to enable and disable the loading ofthe extensions. The extensions -# will be loaded in alphabetical order. -#c.NotebookApp.nbserver_extensions = {} - -## The directory to use for notebooks and kernels. -#c.NotebookApp.notebook_dir = u'' - -## Whether to open in a browser after starting. The specific browser used is -# platform dependent and determined by the python standard library `webbrowser` -# module, unless it is overridden using the --browser (NotebookApp.browser) -# configuration option. -c.NotebookApp.open_browser = False - -## Hashed password to use for web authentication. -# -# To generate, type in a python/IPython shell: -# -# from notebook.auth import passwd; passwd() -# -# The string should be of the form type:salt:hashed-password. -#c.NotebookApp.password = u'' - -## The port the notebook server will listen on. -c.NotebookApp.port = 8888 - -## The number of additional ports to try if the specified port is not available. -#c.NotebookApp.port_retries = 50 - -## DISABLED: use %pylab or %matplotlib in the notebook to enable matplotlib. -#c.NotebookApp.pylab = 'disabled' - -## (sec) Time window used to check the message and data rate limits. -#c.NotebookApp.rate_limit_window = 1.0 - -## Reraise exceptions encountered loading server extensions? -#c.NotebookApp.reraise_server_extension_failures = False - -## DEPRECATED use the nbserver_extensions dict instead -#c.NotebookApp.server_extensions = [] - -## The session manager class to use. 
-#c.NotebookApp.session_manager_class = 'notebook.services.sessions.sessionmanager.SessionManager' - -## Supply SSL options for the tornado HTTPServer. See the tornado docs for -# details. -#c.NotebookApp.ssl_options = {} - -## Token used for authenticating first-time connections to the server. -# -# When no password is enabled, the default is to generate a new, random token. -# -# Setting to an empty string disables authentication altogether, which is NOT -# RECOMMENDED. -#c.NotebookApp.token = '' - -## Supply overrides for the tornado.web.Application that the Jupyter notebook -# uses. -#c.NotebookApp.tornado_settings = {} - -## Whether to trust or not X-Scheme/X-Forwarded-Proto and X-Real-Ip/X-Forwarded- -# For headerssent by the upstream reverse proxy. Necessary if the proxy handles -# SSL -#c.NotebookApp.trust_xheaders = False - -## DEPRECATED, use tornado_settings -#c.NotebookApp.webapp_settings = {} - -## The base URL for websockets, if it differs from the HTTP server (hint: it -# almost certainly doesn't). -# -# Should be in the form of an HTTP origin: ws[s]://hostname[:port] -#c.NotebookApp.websocket_url = '' - -#------------------------------------------------------------------------------ -# ConnectionFileMixin(LoggingConfigurable) configuration -#------------------------------------------------------------------------------ - -## Mixin for configurable classes that work with connection files - -## JSON file in which to store connection info [default: kernel-.json] -# -# This file will contain the IP, ports, and authentication key needed to connect -# clients to this kernel. By default, this file will be created in the security -# dir of the current profile, but can be specified by absolute path. 
-#c.ConnectionFileMixin.connection_file = '' - -## set the control (ROUTER) port [default: random] -#c.ConnectionFileMixin.control_port = 0 - -## set the heartbeat port [default: random] -#c.ConnectionFileMixin.hb_port = 0 - -## set the iopub (PUB) port [default: random] -#c.ConnectionFileMixin.iopub_port = 0 - -## Set the kernel's IP address [default localhost]. If the IP address is -# something other than localhost, then Consoles on other machines will be able -# to connect to the Kernel, so be careful! -#c.ConnectionFileMixin.ip = u'' - -## set the shell (ROUTER) port [default: random] -#c.ConnectionFileMixin.shell_port = 0 - -## set the stdin (ROUTER) port [default: random] -#c.ConnectionFileMixin.stdin_port = 0 - -## -#c.ConnectionFileMixin.transport = 'tcp' - -#------------------------------------------------------------------------------ -# KernelManager(ConnectionFileMixin) configuration -#------------------------------------------------------------------------------ - -## Manages a single kernel in a subprocess on this host. -# -# This version starts kernels with Popen. - -## Should we autorestart the kernel if it dies. -#c.KernelManager.autorestart = True - -## DEPRECATED: Use kernel_name instead. -# -# The Popen Command to launch the kernel. Override this if you have a custom -# kernel. If kernel_cmd is specified in a configuration file, Jupyter does not -# pass any arguments to the kernel, because it cannot make any assumptions about -# the arguments that the kernel understands. In particular, this means that the -# kernel does not receive the option --debug if it given on the Jupyter command -# line. -#c.KernelManager.kernel_cmd = [] - -## Time to wait for a kernel to terminate before killing it, in seconds. 
-#c.KernelManager.shutdown_wait_time = 5.0 - -#------------------------------------------------------------------------------ -# Session(Configurable) configuration -#------------------------------------------------------------------------------ - -## Object for handling serialization and sending of messages. -# -# The Session object handles building messages and sending them with ZMQ sockets -# or ZMQStream objects. Objects can communicate with each other over the -# network via Session objects, and only need to work with the dict-based IPython -# message spec. The Session will handle serialization/deserialization, security, -# and metadata. -# -# Sessions support configurable serialization via packer/unpacker traits, and -# signing with HMAC digests via the key/keyfile traits. -# -# Parameters ---------- -# -# debug : bool -# whether to trigger extra debugging statements -# packer/unpacker : str : 'json', 'pickle' or import_string -# importstrings for methods to serialize message parts. If just -# 'json' or 'pickle', predefined JSON and pickle packers will be used. -# Otherwise, the entire importstring must be used. -# -# The functions must accept at least valid JSON input, and output *bytes*. -# -# For example, to use msgpack: -# packer = 'msgpack.packb', unpacker='msgpack.unpackb' -# pack/unpack : callables -# You can also set the pack/unpack callables for serialization directly. -# session : bytes -# the ID of this Session object. The default is to generate a new UUID. -# username : unicode -# username added to message headers. The default is to ask the OS. -# key : bytes -# The key used to initialize an HMAC signature. If unset, messages -# will not be signed or checked. -# keyfile : filepath -# The file containing a key. If this is set, `key` will be initialized -# to the contents of the file. - -## Threshold (in bytes) beyond which an object's buffer should be extracted to -# avoid pickling. 
-#c.Session.buffer_threshold = 1024 - -## Whether to check PID to protect against calls after fork. -# -# This check can be disabled if fork-safety is handled elsewhere. -#c.Session.check_pid = True - -## Threshold (in bytes) beyond which a buffer should be sent without copying. -#c.Session.copy_threshold = 65536 - -## Debug output in the Session -#c.Session.debug = False - -## The maximum number of digests to remember. -# -# The digest history will be culled when it exceeds this value. -#c.Session.digest_history_size = 65536 - -## The maximum number of items for a container to be introspected for custom -# serialization. Containers larger than this are pickled outright. -#c.Session.item_threshold = 64 - -## execution key, for signing messages. -#c.Session.key = '' - -## path to file containing execution key. -#c.Session.keyfile = '' - -## Metadata dictionary, which serves as the default top-level metadata dict for -# each message. -#c.Session.metadata = {} - -## The name of the packer for serializing messages. Should be one of 'json', -# 'pickle', or an import name for a custom callable serializer. -#c.Session.packer = 'json' - -## The UUID identifying this session. -#c.Session.session = u'' - -## The digest scheme used to construct the message signatures. Must have the form -# 'hmac-HASH'. -#c.Session.signature_scheme = 'hmac-sha256' - -## The name of the unpacker for unserializing messages. Only used with custom -# functions for `packer`. -#c.Session.unpacker = 'json' - -## Username for the Session. Default is your system username. -#c.Session.username = u'username' - -#------------------------------------------------------------------------------ -# MultiKernelManager(LoggingConfigurable) configuration -#------------------------------------------------------------------------------ - -## A class for managing multiple kernels. - -## The name of the default kernel to start -#c.MultiKernelManager.default_kernel_name = 'python2' - -## The kernel manager class. 
This is configurable to allow subclassing of the -# KernelManager for customized behavior. -#c.MultiKernelManager.kernel_manager_class = 'jupyter_client.ioloop.IOLoopKernelManager' - -#------------------------------------------------------------------------------ -# MappingKernelManager(MultiKernelManager) configuration -#------------------------------------------------------------------------------ - -## A KernelManager that handles notebook mapping and HTTP error handling - -## -#c.MappingKernelManager.root_dir = u'' - -#------------------------------------------------------------------------------ -# ContentsManager(LoggingConfigurable) configuration -#------------------------------------------------------------------------------ - -## Base class for serving files and directories. -# -# This serves any text or binary file, as well as directories, with special -# handling for JSON notebook documents. -# -# Most APIs take a path argument, which is always an API-style unicode path, and -# always refers to a directory. -# -# - unicode, not url-escaped -# - '/'-separated -# - leading and trailing '/' will be stripped -# - if unspecified, path defaults to '', -# indicating the root path. - -## -#c.ContentsManager.checkpoints = None - -## -#c.ContentsManager.checkpoints_class = 'notebook.services.contents.checkpoints.Checkpoints' - -## -#c.ContentsManager.checkpoints_kwargs = {} - -## Glob patterns to hide in file and directory listings. -#c.ContentsManager.hide_globs = [u'__pycache__', '*.pyc', '*.pyo', '.DS_Store', '*.so', '*.dylib', '*~'] - -## Python callable or importstring thereof -# -# To be called on a contents model prior to save. -# -# This can be used to process the structure, such as removing notebook outputs -# or other side effects that should not be saved. -# -# It will be called as (all arguments passed by keyword):: -# -# hook(path=path, model=model, contents_manager=self) -# -# - model: the model to be saved. Includes file contents. 
-# Modifying this dict will affect the file that is stored. -# - path: the API path of the save destination -# - contents_manager: this ContentsManager instance -#c.ContentsManager.pre_save_hook = None - -## The base name used when creating untitled directories. -#c.ContentsManager.untitled_directory = 'Untitled Folder' - -## The base name used when creating untitled files. -#c.ContentsManager.untitled_file = 'untitled' - -## The base name used when creating untitled notebooks. -#c.ContentsManager.untitled_notebook = 'Untitled' - -#------------------------------------------------------------------------------ -# FileManagerMixin(Configurable) configuration -#------------------------------------------------------------------------------ - -## Mixin for ContentsAPI classes that interact with the filesystem. -# -# Provides facilities for reading, writing, and copying both notebooks and -# generic files. -# -# Shared by FileContentsManager and FileCheckpoints. -# -# Note ---- Classes using this mixin must provide the following attributes: -# -# root_dir : unicode -# A directory against against which API-style paths are to be resolved. -# -# log : logging.Logger - -## By default notebooks are saved on disk on a temporary file and then if -# succefully written, it replaces the old ones. This procedure, namely -# 'atomic_writing', causes some bugs on file system whitout operation order -# enforcement (like some networked fs). If set to False, the new notebook is -# written directly on the old one which could fail (eg: full filesystem or quota -# ) -#c.FileManagerMixin.use_atomic_writing = True - -#------------------------------------------------------------------------------ -# FileContentsManager(FileManagerMixin,ContentsManager) configuration -#------------------------------------------------------------------------------ - -## Python callable or importstring thereof -# -# to be called on the path of a file just saved. 
-# -# This can be used to process the file on disk, such as converting the notebook -# to a script or HTML via nbconvert. -# -# It will be called as (all arguments passed by keyword):: -# -# hook(os_path=os_path, model=model, contents_manager=instance) -# -# - path: the filesystem path to the file just written - model: the model -# representing the file - contents_manager: this ContentsManager instance -#c.FileContentsManager.post_save_hook = None - -## -#c.FileContentsManager.root_dir = u'' - -## DEPRECATED, use post_save_hook. Will be removed in Notebook 5.0 -#c.FileContentsManager.save_script = False - -#------------------------------------------------------------------------------ -# NotebookNotary(LoggingConfigurable) configuration -#------------------------------------------------------------------------------ - -## A class for computing and verifying notebook signatures. - -## The hashing algorithm used to sign notebooks. -#c.NotebookNotary.algorithm = 'sha256' - -## The sqlite file in which to store notebook signatures. By default, this will -# be in your Jupyter data directory. You can set it to ':memory:' to disable -# sqlite writing to the filesystem. -#c.NotebookNotary.db_file = u'' - -## The secret key with which notebooks are signed. -#c.NotebookNotary.secret = '' - -## The file where the secret key is stored. -#c.NotebookNotary.secret_file = u'' - -## A callable returning the storage backend for notebook signatures. The default -# uses an SQLite database. -#c.NotebookNotary.store_factory = traitlets.Undefined - -#------------------------------------------------------------------------------ -# KernelSpecManager(LoggingConfigurable) configuration -#------------------------------------------------------------------------------ - -## If there is no Python kernelspec registered and the IPython kernel is -# available, ensure it is added to the spec list. -#c.KernelSpecManager.ensure_native_kernel = True - -## The kernel spec class. 
This is configurable to allow subclassing of the -# KernelSpecManager for customized behavior. -#c.KernelSpecManager.kernel_spec_class = 'jupyter_client.kernelspec.KernelSpec' - -## Whitelist of allowed kernel names. -# -# By default, all installed kernels are allowed. -#c.KernelSpecManager.whitelist = set([]) diff --git a/model_infer_gpu.sh b/model_infer_gpu.sh new file mode 100755 index 0000000..3049bcb --- /dev/null +++ b/model_infer_gpu.sh @@ -0,0 +1,10 @@ +#!/bin/bash +#!/bin/bash +python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 2 --gpus 0,1 --batch-size 8 \ + --disp-batches 1 --num-epochs 1 --model-prefix trained_model --dropout 0.0 \ + --infer --load-epoch 1 --attention +# --attention +# --input-feed --remove-state-feed +# --use-cudnn-cells +# --use-cudnn-cells +# --inference-unrolling-for-training diff --git a/seq2seq/model_train_cpu.sh b/model_train_cpu.sh similarity index 100% rename from seq2seq/model_train_cpu.sh rename to model_train_cpu.sh diff --git a/model_train_gpu.sh b/model_train_gpu.sh new file mode 100755 index 0000000..6449087 --- /dev/null +++ b/model_train_gpu.sh @@ -0,0 +1,9 @@ +#!/bin/bash +python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 2 --gpus 0,1 --batch-size 256 \ + --optimizer adagrad --lr 0.01 --disp-batches 10 --num-epochs 10 \ + --dropout 0.3 --seed 1234 --model-prefix trained_model --attention --input-feed --remove-state-feed +# --attention +# --input-feed --remove-state-feed +# --model-prefix trained_model +# --use-cudnn-cells +# --inference-unrolling-for-training diff --git a/model_train_gpu_alt_unrolling.sh b/model_train_gpu_alt_unrolling.sh new file mode 100755 index 0000000..9d7f816 --- /dev/null +++ b/model_train_gpu_alt_unrolling.sh @@ -0,0 +1,7 @@ +#!/bin/bash +python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 2 --gpus 0 --batch-size 128 \ + --optimizer adagrad --lr 0.141 --disp-batches 100 --num-epochs 1
--model-prefix trained_model \ + --dropout 0.3 --seed 1234 + +# --inference-unrolling-for-training + diff --git a/nan.patch b/nan.patch deleted file mode 100644 index e858c1e..0000000 --- a/nan.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py -index 396f5a1..544eab2 100644 ---- a/python/mxnet/callback.py -+++ b/python/mxnet/callback.py -@@ -96,13 +96,16 @@ class Speedometer(object): - frequent: int - How many batches between calculations. - Defaults to calculating & logging every 50 batches. -+ auto_reset : bool -+ Reset the metric after each log. - """ -- def __init__(self, batch_size, frequent=50): -+ def __init__(self, batch_size, frequent=50, auto_reset=False): - self.batch_size = batch_size - self.frequent = frequent - self.init = False - self.tic = 0 - self.last_count = 0 -+ self.auto_reset = auto_reset - - def __call__(self, param): - """Callback to Show speed.""" -@@ -116,7 +119,8 @@ class Speedometer(object): - speed = self.frequent * self.batch_size / (time.time() - self.tic) - if param.eval_metric is not None: - name_value = param.eval_metric.get_name_value() -- param.eval_metric.reset() -+ if self.auto_reset: -+ param.eval_metric.reset() - for name, value in name_value: - logging.info('Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f', - param.epoch, count, speed, name, value) -diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py -index 2870fab..8a8b974 100644 ---- a/python/mxnet/metric.py -+++ b/python/mxnet/metric.py -@@ -265,7 +265,8 @@ class Perplexity(EvalMetric): - self.num_inst += num - - def get(self): -- return (self.name, math.exp(self.sum_metric/self.num_inst)) -+ num = self.num_inst if self.num_inst > 0 else float('nan') -+ return (self.name, math.exp(self.sum_metric/num)) - - #################### - # REGRESSION METRICS diff --git a/seq2seq/preprocess_data.py b/preprocess_data.py similarity index 65% rename from seq2seq/preprocess_data.py rename to preprocess_data.py 
index 274b7f1..7836dea 100644 --- a/seq2seq/preprocess_data.py +++ b/preprocess_data.py @@ -28,19 +28,14 @@ start = time() dataset = get_s2s_data( - src_train_path= './data/wmt15-de-en/train.en', - src_valid_path= './data/wmt15-de-en/valid.en', - targ_train_path= './data/wmt15-de-en/train.de', - targ_valid_path= './data/wmt15-de-en/valid.de' + src_train_path= './data/wmt15-de-en/train.de', + src_valid_path= './data/wmt15-de-en/valid.de', + src_test_path = './data/wmt15-de-en/test.de', + targ_train_path= './data/wmt15-de-en/train.en', + targ_valid_path= './data/wmt15-de-en/valid.en', + targ_test_path = './data/wmt15-de-en/test.en' ) -# dataset = get_s2s_data( -# src_train_path= './data/europarl-v7.es-en.en_train_small', -# src_valid_path= './data/europarl-v7.es-en.en_valid_small', -# targ_train_path= './data/europarl-v7.es-en.es_train_small', -# targ_valid_path= './data/europarl-v7.es-en.en_valid_small' -# ) - preproc_duration = time() - start print("\nPreprocessing data took %.4f seconds\n" % preproc_duration) @@ -59,11 +54,12 @@ # all_pairs = [(5, 5), (15,15), (20, 20)] # max_sent_len = 25 + batch_size=64 print("Constructing train iterator") start = time() train_iter = Seq2SeqIter(dataset.src_train_sent, dataset.targ_train_sent, dataset.src_vocab, dataset.inv_src_vocab, - dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=64, buckets=all_pairs, max_sent_len=max_len) + dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=batch_size, buckets=all_pairs, max_sent_len=max_len) train_iter.bucketize() train_iter_duration = time() - start @@ -80,7 +76,7 @@ print("Constructing valid iterator") valid_iter_duration = time() valid_iter = Seq2SeqIter(dataset.src_valid_sent, dataset.targ_valid_sent, dataset.src_vocab, dataset.inv_src_vocab, - dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=64, buckets=all_pairs, max_sent_len=50) + dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=batch_size, 
buckets=all_pairs, max_sent_len=max_len) valid_iter.bucketize() valid_iter_duration = time() - start @@ -89,7 +85,23 @@ print("Serializing validation set iterator.") start = time() with open('./data/valid_iterator.pkl', 'wb') as f: - pickle.dump(train_iter, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(valid_iter, f, pickle.HIGHEST_PROTOCOL) valid_ser_duration = time() - start print("\nSerializing validation set iterator took %.4f seconds\n" % valid_ser_duration) + print("Constructing test iterator") + start = time() + test_iter = Seq2SeqIter(dataset.src_test_sent, dataset.targ_test_sent, dataset.src_vocab, dataset.inv_src_vocab, + dataset.targ_vocab, dataset.inv_targ_vocab, layout='TN', batch_size=batch_size, buckets=all_pairs, max_sent_len=max_len) + + test_iter.bucketize() + test_iter_duration = time() - start + print("\nBucketizing data for test set iterator took %.4f seconds\n" % test_iter_duration) + + print("Serializing test set iterator.") + start = time() + with open('./data/test_iterator.pkl', 'wb') as f: + pickle.dump(test_iter, f, pickle.HIGHEST_PROTOCOL) + test_ser_duration = time() - start + print("\nSerializing test set iterator took %.4f seconds\n" % test_ser_duration) + diff --git a/seq2seq/get_opennmt_data.sh b/seq2seq/get_opennmt_data.sh deleted file mode 100755 index abad5d1..0000000 --- a/seq2seq/get_opennmt_data.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -DATA_DIR_ROOT="./data" -DATA_DIR="${DATA_DIR_ROOT}/wmt15-de-en" -IN_SRC_DATA_PATH="${DATA_DIR}/all.en" -IN_TARG_DATA_PATH="${DATA_DIR}/all.de" -OUT_SRC_TRAIN_PATH="${DATA_DIR}/train.en" -OUT_TARG_TRAIN_PATH="${DATA_DIR}/train.de" -OUT_SRC_VALID_PATH="${DATA_DIR}/valid.en" -OUT_TARG_VALID_PATH="${DATA_DIR}/valid.de" -VALIDATION_FRACTION=0.2 -SHUFFLE_SEED=42 - -mkdir -p ${DATA_DIR} - -pushd .
> /dev/null - -cd ${DATA_DIR_ROOT} - -echo -e "\nDownloading dataset" - -wget https://s3.amazonaws.com/opennmt-trainingdata/wmt15-de-en.tgz - -echo -e "\nDecompressing dataset\n" - -tar xvf wmt15-de-en.tgz - -echo -e "\nConcatenating corpora" - -cd wmt15-de-en - -# concatenate corpora - note concatenation has to be in -# the same order for both languages - -# we will split this into training and validation sets -cat commoncrawl.de-en.de europarl-v7.de-en.de news-commentary-v10.de-en.de > all.de -# the test set already officially exists -mv newstest2013.de test.de - -# do the same thing to English corpora - -cat commoncrawl.de-en.en europarl-v7.de-en.en news-commentary-v10.de-en.en > all.en -mv newstest2013.en test.en - -popd > /dev/null - -echo -e "\nShuffling examples and splitting into training and validation samples" - - -# shuffle examples so validation data isn't completely from one -# corpus while training is from another - -python split_train_valid.py \ - --in-src-data-path ${IN_SRC_DATA_PATH} \ - --in-targ-data-path ${IN_TARG_DATA_PATH} \ - --out-src-train-path ${OUT_SRC_TRAIN_PATH} \ - --out-targ-train-path ${OUT_TARG_TRAIN_PATH} \ - --out-src-valid-path ${OUT_SRC_VALID_PATH} \ - --out-targ-valid-path ${OUT_TARG_VALID_PATH} \ - --validation-fraction 0.2 \ - --shuffle-seed 42 - - diff --git a/seq2seq/model_infer_gpu.sh b/seq2seq/model_infer_gpu.sh deleted file mode 100755 index 1755ec1..0000000 --- a/seq2seq/model_infer_gpu.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 1 --gpus 0,1 --batch-size 256 --disp-batches 1 --num-epochs 1 --model-prefix trained_model --dropout 0.5 --infer --load-epoch 1 -# --use-cudnn-cells diff --git a/seq2seq/model_train_gpu.sh b/seq2seq/model_train_gpu.sh deleted file mode 100755 index 3f4e987..0000000 --- a/seq2seq/model_train_gpu.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -python seq2seq_bucketing.py --num-hidden 512 --num-embed 512 --num-layers 22 
--gpus 0,1 --batch-size 256 --optimizer adagrad --lr 0.1 --disp-batches 1 --num-epochs 1 --model-prefix trained_model --dropout 0.5 -# --use-cudnn-cells diff --git a/seq2seq/rnn_cell.py b/seq2seq/rnn_cell.py deleted file mode 100644 index f93072f..0000000 --- a/seq2seq/rnn_cell.py +++ /dev/null @@ -1,951 +0,0 @@ -# coding: utf-8 -# pylint: disable=no-member, invalid-name, protected-access, no-self-use -# pylint: disable=too-many-branches, too-many-arguments, no-self-use -# pylint: disable=too-many-lines -"""Definition of various recurrent neural network cells.""" -from __future__ import print_function - -import warnings - -import mxnet as mx -from mxnet import symbol, init, ndarray, _symbol_internal -from mxnet.base import string_types, numeric_types - - -def _cells_state_shape(cells): - return sum([c.state_shape for c in cells], []) - -def _cells_begin_state(cells, **kwargs): - return sum([c.begin_state(**kwargs) for c in cells], []) - -def _cells_unpack_weights(cells, args): - for cell in cells: - args = cell.unpack_weights(args) - return args - -def _cells_pack_weights(cells, args): - for cell in cells: - args = cell.pack_weights(args) - return args - -def _normalize_sequence(length, inputs, layout, merge, in_layout=None): - assert inputs is not None, \ - "unroll(inputs=None) has been deprecated. " \ - "Please create input variables outside unroll." - - axis = layout.find('T') - in_axis = in_layout.find('T') if in_layout is not None else axis - if isinstance(inputs, symbol.Symbol): - if merge is False: - assert len(inputs.list_outputs()) == 1, \ - "unroll doesn't allow grouped symbol as input. Please convert " \ - "to list with list(inputs) first or let unroll handle splitting." 
- inputs = list(symbol.split(inputs, axis=in_axis, num_outputs=length, - squeeze_axis=1)) - else: - assert length is None or len(inputs) == length - if merge is True: - inputs = [symbol.expand_dims(i, axis=axis) for i in inputs] - inputs = symbol.Concat(*inputs, dim=axis) - in_axis = axis - - if isinstance(inputs, symbol.Symbol) and axis != in_axis: - inputs = symbol.swapaxes(inputs, dim0=axis, dim1=in_axis) - - return inputs, axis - - -class RNNParams(object): - """Container for holding variables. - Used by RNN cells for parameter sharing between cells. - - Parameters - ---------- - prefix : str - All variables' name created by this container will - be prepended with prefix - """ - def __init__(self, prefix=''): - self._prefix = prefix - self._params = {} - - def get(self, name, **kwargs): - """Get a variable with name or create a new one if missing. - - Parameters - ---------- - name : str - name of the variable - **kwargs : - more arguments that's passed to symbol.Variable - """ - name = self._prefix + name - if name not in self._params: - self._params[name] = symbol.Variable(name, **kwargs) - return self._params[name] - - -class BaseRNNCell(object): - """Abstract base class for RNN cells - - Parameters - ---------- - prefix : str - prefix for name of layers - (and name of weight if params is None) - params : RNNParams or None - container for weight sharing between cells. - created if None. - """ - def __init__(self, prefix='', params=None): - if params is None: - params = RNNParams(prefix) - self._own_params = True - else: - self._own_params = False - self._prefix = prefix - self._params = params - self._modified = False - - self.reset() - - def reset(self): - """Reset before re-using the cell for another graph""" - self._init_counter = -1 - self._counter = -1 - - def __call__(self, inputs, states): - """Construct symbol for one step of RNN. 
- - Parameters - ---------- - inputs : sym.Variable - input symbol, 2D, batch * num_units - states : sym.Variable - state from previous step or begin_state(). - - Returns - ------- - output : Symbol - output symbol - states : Symbol - state to next step of RNN. - """ - raise NotImplementedError() - - @property - def params(self): - """Parameters of this cell""" - self._own_params = False - return self._params - - @property - def state_shape(self): - """shape(s) of states""" - raise NotImplementedError() - - @property - def _gate_names(self): - """name(s) of gates""" - return () - - def begin_state(self, func=symbol.zeros, **kwargs): - """Initial state for this cell. - - Parameters - ---------- - func : callable, default symbol.zeros - Function for creating initial state. Can be symbol.zeros, - symbol.uniform, symbol.Variable etc. - Use symbol.Variable if you want to directly - feed input as states. - **kwargs : - more keyword arguments passed to func. For example - mean, std, dtype, etc. - - Returns - ------- - states : nested list of Symbol - starting states for first RNN step - """ - assert not self._modified, \ - "After applying modifier cells (e.g. DropoutCell) the base " \ - "cell cannot be called directly. Call the modifier cell instead." - states = [] - for shape in self.state_shape: - self._init_counter += 1 - if shape is None: - state = func(name='%sbegin_state_%d'%(self._prefix, self._init_counter), - **kwargs) - else: - state = func(name='%sbegin_state_%d'%(self._prefix, self._init_counter), - shape=shape, **kwargs) - states.append(state) - return states - - def unpack_weights(self, args): - """Unpack fused weight matrices into separate - weight matrices - - Parameters - ---------- - args : dict of str -> NDArray - dictionary containing packed weights. - usually from Module.get_output() - - Returns - ------- - args : dict of str -> NDArray - dictionary with weights associated to - this cell unpacked. 
- """ - args = args.copy() - if not self._gate_names: - return args - h = self._num_hidden - for group_name in ['i2h', 'h2h']: - weight = args.pop('%s%s_weight'%(self._prefix, group_name)) - bias = args.pop('%s%s_bias' % (self._prefix, group_name)) - for j, gate in enumerate(self._gate_names): - wname = '%s%s%s_weight' % (self._prefix, group_name, gate) - args[wname] = weight[j*h:(j+1)*h].copy() - bname = '%s%s%s_bias' % (self._prefix, group_name, gate) - args[bname] = bias[j*h:(j+1)*h].copy() - return args - - def pack_weights(self, args): - """Pack separate weight matrices into fused - weight. - - Parameters - ---------- - args : dict of str -> NDArray - dictionary containing unpacked weights. - - Returns - ------- - args : dict of str -> NDArray - dictionary with weights associated to - this cell packed. - """ - args = args.copy() - if not self._gate_names: - return args - for group_name in ['i2h', 'h2h']: - weight = [] - bias = [] - for gate in self._gate_names: - wname = '%s%s%s_weight'%(self._prefix, group_name, gate) - weight.append(args.pop(wname)) - bname = '%s%s%s_bias'%(self._prefix, group_name, gate) - bias.append(args.pop(bname)) - args['%s%s_weight'%(self._prefix, group_name)] = ndarray.concatenate(weight) - args['%s%s_bias'%(self._prefix, group_name)] = ndarray.concatenate(bias) - return args - - def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None): - """Unroll an RNN cell across time steps. - - Parameters - ---------- - length : int - number of steps to unroll - inputs : Symbol, list of Symbol, or None - if inputs is a single Symbol (usually the output - of Embedding symbol), it should have shape - (batch_size, length, ...) if layout == 'NTC', - or (length, batch_size, ...) if layout == 'TNC'. - - If inputs is a list of symbols (usually output of - previous unroll), they should all have shape - (batch_size, ...). - begin_state : nested list of Symbol - input states. 
Created by begin_state() - or output state of another cell. Created - from begin_state() if None. - layout : str - layout of input symbol. Only used if inputs - is a single Symbol. - merge_outputs : bool - If False, return outputs as a list of Symbols. - If True, concatenate output across time steps - and return a single symbol with shape - (batch_size, length, ...) if layout == 'NTC', - or (length, batch_size, ...) if layout == 'TNC'. - If None, output whatever is faster - - Returns - ------- - outputs : list of Symbol - output symbols. - states : Symbol or nested list of Symbol - has the same structure as begin_state() - """ - self.reset() - - inputs, _ = _normalize_sequence(length, inputs, layout, False) - if begin_state is None: - begin_state = self.begin_state() - - states = begin_state - outputs = [] - for i in range(length): - output, states = self(inputs[i], states) - outputs.append(output) - - outputs, _ = _normalize_sequence(length, outputs, layout, merge_outputs) - - return outputs, states - - #pylint: disable=no-self-use - def _get_activation(self, inputs, activation, **kwargs): - """Get activation function. Convert if is string""" - if isinstance(activation, string_types): - return symbol.Activation(inputs, act_type=activation, **kwargs) - else: - return activation(inputs, **kwargs) - - -class RNNCell(BaseRNNCell): - """Simple recurrent neural network cell - - Parameters - ---------- - num_hidden : int - number of units in output symbol - activation : str or Symbol, default 'tanh' - type of activation function - prefix : str, default 'rnn_' - prefix for name of layers - (and name of weight if params is None) - params : RNNParams or None - container for weight sharing between cells. - created if None. 
- """ - def __init__(self, num_hidden, activation='tanh', prefix='rnn_', params=None): - super(RNNCell, self).__init__(prefix=prefix, params=params) - self._num_hidden = num_hidden - self._activation = activation - self._iW = self.params.get('i2h_weight') - self._iB = self.params.get('i2h_bias') - self._hW = self.params.get('h2h_weight') - self._hB = self.params.get('h2h_bias') - - @property - def state_shape(self): - return [(0, self._num_hidden)] - - @property - def _gate_names(self): - return ('',) - - def __call__(self, inputs, states): - self._counter += 1 - name = '%st%d_'%(self._prefix, self._counter) - i2h = symbol.FullyConnected(data=inputs, weight=self._iW, bias=self._iB, - num_hidden=self._num_hidden, - name='%si2h'%name) - h2h = symbol.FullyConnected(data=states[0], weight=self._hW, bias=self._hB, - num_hidden=self._num_hidden, - name='%sh2h'%name) - output = self._get_activation(i2h + h2h, self._activation, - name='%sout'%name) - - return output, [output] - - -class LSTMCell(BaseRNNCell): - """Long-Short Term Memory (LSTM) network cell. - - Parameters - ---------- - num_hidden : int - number of units in output symbol - prefix : str, default 'rnn_' - prefix for name of layers - (and name of weight if params is None) - params : RNNParams or None - container for weight sharing between cells. - created if None. - forget_bias : bias added to forget gate, default 1.0. - Jozefowicz et al. 
2015 recommends setting this to 1.0 - """ - def __init__(self, num_hidden, prefix='lstm_', params=None, forget_bias=1.0): - super(LSTMCell, self).__init__(prefix=prefix, params=params) - - self._num_hidden = num_hidden - self._iW = self.params.get('i2h_weight') - self._hW = self.params.get('h2h_weight') - # we add the forget_bias to i2h_bias, this adds the bias to the forget gate activation - self._iB = self.params.get('i2h_bias', init=init.LSTMBias(forget_bias=forget_bias)) - self._hB = self.params.get('h2h_bias') - - @property - def state_shape(self): - return [(0, self._num_hidden), (0, self._num_hidden)] - - @property - def _gate_names(self): - return ['_i', '_f', '_c', '_o'] - - def __call__(self, inputs, states): - self._counter += 1 - name = '%st%d_'%(self._prefix, self._counter) - i2h = symbol.FullyConnected(data=inputs, weight=self._iW, bias=self._iB, - num_hidden=self._num_hidden*4, - name='%si2h'%name) - h2h = symbol.FullyConnected(data=states[0], weight=self._hW, bias=self._hB, - num_hidden=self._num_hidden*4, - name='%sh2h'%name) - gates = i2h + h2h - slice_gates = symbol.SliceChannel(gates, num_outputs=4, - name="%sslice"%name) - in_gate = symbol.Activation(slice_gates[0], act_type="sigmoid", - name='%si'%name) - forget_gate = symbol.Activation(slice_gates[1], act_type="sigmoid", - name='%sf'%name) - in_transform = symbol.Activation(slice_gates[2], act_type="tanh", - name='%sc'%name) - out_gate = symbol.Activation(slice_gates[3], act_type="sigmoid", - name='%so'%name) - next_c = symbol._internal._plus(forget_gate * states[1], in_gate * in_transform, - name='%sstate'%name) - next_h = symbol._internal._mul(out_gate, symbol.Activation(next_c, act_type="tanh"), - name='%sout'%name) - - return next_h, [next_h, next_c] - - -class GRUCell(BaseRNNCell): - """Gated Rectified Unit (GRU) network cell. - Note: this is an implementation of the cuDNN version of GRUs - (slight modification compared to Cho et al. 2014). 
- - Parameters - ---------- - num_hidden : int - number of units in output symbol - prefix : str, default 'gru_' - prefix for name of layers - (and name of weight if params is None) - params : RNNParams or None - container for weight sharing between cells. - created if None. - """ - def __init__(self, num_hidden, prefix='gru_', params=None): - super(GRUCell, self).__init__(prefix=prefix, params=params) - self._num_hidden = num_hidden - self._iW = self.params.get("i2h_weight") - self._iB = self.params.get("i2h_bias") - self._hW = self.params.get("h2h_weight") - self._hB = self.params.get("h2h_bias") - - @property - def state_shape(self): - return [(0, self._num_hidden)] - - @property - def _gate_names(self): - return ['_r', '_z', '_o'] - - def __call__(self, inputs, states): - # pylint: disable=too-many-locals - self._counter += 1 - - seq_idx = self._counter - name = '%st%d_' % (self._prefix, seq_idx) - prev_state_h = states[0] - - i2h = symbol.FullyConnected(data=inputs, - weight=self._iW, - bias=self._iB, - num_hidden=self._num_hidden * 3, - name="%s_i2h" % name) - h2h = symbol.FullyConnected(data=prev_state_h, - weight=self._hW, - bias=self._hB, - num_hidden=self._num_hidden * 3, - name="%s_h2h" % name) - - i2h_r, i2h_z, i2h = symbol.SliceChannel(i2h, num_outputs=3, name="%s_i2h_slice" % name) - h2h_r, h2h_z, h2h = symbol.SliceChannel(h2h, num_outputs=3, name="%s_h2h_slice" % name) - - reset_gate = symbol.Activation(i2h_r + h2h_r, act_type="sigmoid", - name="%s_r_act" % name) - update_gate = symbol.Activation(i2h_z + h2h_z, act_type="sigmoid", - name="%s_z_act" % name) - - next_h_tmp = symbol.Activation(i2h + reset_gate * h2h, act_type="tanh", - name="%s_h_act" % name) - - next_h = symbol._internal._plus((1. - update_gate) * next_h_tmp, update_gate * prev_state_h, - name='%sout' % name) - - return next_h, [next_h] - - -class FusedRNNCell(BaseRNNCell): - """Fusing RNN layers across time step into one kernel. - Improves speed but is less flexible. 
Currently only - supported if using cuDNN on GPU. - - Parameters - ---------- - """ - def __init__(self, num_hidden, num_layers=1, mode='lstm', bidirectional=False, - dropout=0., get_next_state=False, forget_bias=1.0, - prefix=None, params=None): - if prefix is None: - prefix = '%s_'%mode - super(FusedRNNCell, self).__init__(prefix=prefix, params=params) - self._num_hidden = num_hidden - self._num_layers = num_layers - self._mode = mode - self._bidirectional = bidirectional - self._dropout = dropout - self._get_next_state = get_next_state - self._directions = ['l', 'r'] if bidirectional else ['l'] - - initializer = init.FusedRNN(None, num_hidden, num_layers, mode, - bidirectional, forget_bias) - self._parameter = self.params.get('parameters', init=initializer) - - @property - def state_shape(self): - b = self._bidirectional + 1 - n = (self._mode == 'lstm') + 1 - return [(b*self._num_layers, 0, self._num_hidden)]*n - - @property - def _gate_names(self): - return {'rnn_relu': [''], - 'rnn_tanh': [''], - 'lstm': ['_i', '_f', '_c', '_o'], - 'gru': ['_r', '_z', '_o']}[self._mode] - - @property - def _num_gates(self): - return len(self._gate_names) - - def _slice_weights(self, arr, li, lh): - """slice fused rnn weights""" - args = {} - gate_names = self._gate_names - directions = self._directions - - b = len(directions) - p = 0 - for layer in range(self._num_layers): - for direction in directions: - for gate in gate_names: - name = '%s%s%d_i2h%s_weight'%(self._prefix, direction, layer, gate) - if layer > 0: - size = b*lh*lh - args[name] = arr[p:p+size].reshape((lh, b*lh)) - else: - size = li*lh - args[name] = arr[p:p+size].reshape((lh, li)) - p += size - for gate in gate_names: - name = '%s%s%d_h2h%s_weight'%(self._prefix, direction, layer, gate) - size = lh**2 - args[name] = arr[p:p+size].reshape((lh, lh)) - p += size - - for layer in range(self._num_layers): - for direction in directions: - for gate in gate_names: - name = '%s%s%d_i2h%s_bias'%(self._prefix, direction, 
layer, gate) - args[name] = arr[p:p+lh] - p += lh - for gate in gate_names: - name = '%s%s%d_h2h%s_bias'%(self._prefix, direction, layer, gate) - args[name] = arr[p:p+lh] - p += lh - - assert p == arr.size, "Invalid parameters size for FusedRNNCell" - return args - - def unpack_weights(self, args): - args = args.copy() - arr = args.pop(self._parameter.name) - b = len(self._directions) - m = self._num_gates - h = self._num_hidden - num_input = arr.size//b//h//m - (self._num_layers - 1)*(h+b*h+2) - h - 2 - - nargs = self._slice_weights(arr, num_input, self._num_hidden) - args.update({name: nd.copy() for name, nd in nargs.items()}) - return args - - def pack_weights(self, args): - args = args.copy() - b = self._bidirectional + 1 - m = self._num_gates - c = self._gate_names - h = self._num_hidden - w0 = args['%sl0_i2h%s_weight'%(self._prefix, c[0])] - num_input = w0.shape[1] - total = (num_input+h+2)*h*m*b + (self._num_layers-1)*m*h*(h+b*h+2)*b - - arr = ndarray.zeros((total,), ctx=w0.context, dtype=w0.dtype) - for name, nd in self._slice_weights(arr, num_input, h).items(): - nd[:] = args.pop(name) - args[self._parameter.name] = arr - return args - - def __call__(self, inputs, states): - raise NotImplementedError("FusedRNNCell cannot be stepped. Please use unroll") - - def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None): - self.reset() - - inputs, axis = _normalize_sequence(length, inputs, layout, True) - if axis == 1: - warnings.warn("NTC layout detected. 
Consider using " - "TNC for FusedRNNCell for faster speed") - inputs = symbol.swapaxes(inputs, dim1=0, dim2=1) - else: - assert axis == 0, "Unsupported layout %s"%layout - if begin_state is None: - begin_state = self.begin_state() - - states = begin_state - if self._mode == 'lstm': - states = {'state': states[0], 'state_cell': states[1]} # pylint: disable=redefined-variable-type - else: - states = {'state': states[0]} - - rnn = symbol.RNN(data=inputs, parameters=self._parameter, - state_size=self._num_hidden, num_layers=self._num_layers, - bidirectional=self._bidirectional, p=self._dropout, - state_outputs=self._get_next_state, - mode=self._mode, name=self._prefix+'rnn', - **states) - - if not self._get_next_state: - outputs, states = rnn, [] - elif self._mode == 'lstm': - outputs, states = rnn[0], [rnn[1], rnn[2]] - else: - outputs, states = rnn[0], [rnn[1]] - - if axis == 1: - outputs = symbol.swapaxes(outputs, dim1=0, dim2=1) - - outputs, _ = _normalize_sequence(length, outputs, layout, merge_outputs) - - return outputs, states - - def unfuse(self): - """Unfuse the fused RNN in to a stack of rnn cells. - - Returns - ------- - cell : SequentialRNNCell - unfused cell that can be used for stepping, and can run on CPU. 
- """ - stack = SequentialRNNCell() - get_cell = {'rnn_relu': lambda cell_prefix: RNNCell(self._num_hidden, - activation='relu', - prefix=cell_prefix), - 'rnn_tanh': lambda cell_prefix: RNNCell(self._num_hidden, - activation='tanh', - prefix=cell_prefix), - 'lstm': lambda cell_prefix: LSTMCell(self._num_hidden, - prefix=cell_prefix), - 'gru': lambda cell_prefix: GRUCell(self._num_hidden, - prefix=cell_prefix)}[self._mode] - for i in range(self._num_layers): - if self._bidirectional: - stack.add(BidirectionalCell( - get_cell('%sl%d_'%(self._prefix, i)), - get_cell('%sr%d_'%(self._prefix, i)), - output_prefix='%sbi_l%d_'%(self._prefix, i))) - else: - stack.add(get_cell('%sl%d_'%(self._prefix, i))) - - if self._dropout > 0 and i != self._num_layers - 1: - stack.add(DropoutCell(self._dropout, prefix='%s_dropout%d_'%(self._prefix, i))) - - return stack - - -class SequentialRNNCell(BaseRNNCell): - """Sequantially stacking multiple RNN cells - - Parameters - ---------- - params : RNNParams or None - container for weight sharing between cells. - created if None. - """ - def __init__(self, params=None): - super(SequentialRNNCell, self).__init__(prefix='', params=params) - self._override_cell_params = params is not None - self._cells = [] - - def add(self, cell): - """Append a cell into the stack. - - Parameters - ---------- - cell : rnn cell - """ - self._cells.append(cell) - if self._override_cell_params: - assert cell._own_params, \ - "Either specify params for SequentialRNNCell " \ - "or child cells, not both." - cell.params._params.update(self.params._params) - self.params._params.update(cell.params._params) - - @property - def state_shape(self): - return _cells_state_shape(self._cells) - - def begin_state(self, **kwargs): - assert not self._modified, \ - "After applying modifier cells (e.g. ZoneoutCell) the base " \ - "cell cannot be called directly. Call the modifier cell instead." 
- return _cells_begin_state(self._cells, **kwargs) - - def unpack_weights(self, args): - return _cells_unpack_weights(self._cells, args) - - def pack_weights(self, args): - return _cells_pack_weights(self._cells, args) - - def __call__(self, inputs, states): - self._counter += 1 - next_states = [] - p = 0 - for cell in self._cells: - assert not isinstance(cell, BidirectionalCell) - n = len(cell.state_shape) - state = states[p:p+n] - p += n - inputs, state = cell(inputs, state) - next_states.append(state) - return inputs, sum(next_states, []) - - def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None): - self.reset() - - num_cells = len(self._cells) - if begin_state is None: - begin_state = self.begin_state() - - p = 0 - next_states = [] - for i, cell in enumerate(self._cells): - n = len(cell.state_shape) - states = begin_state[p:p+n] - p += n - inputs, states = cell.unroll(length, inputs=inputs, begin_state=states, layout=layout, - merge_outputs=None if i < num_cells-1 else merge_outputs) - next_states.extend(states) - - return inputs, next_states - - -class DropoutCell(BaseRNNCell): - """Apply dropout on input. - - Parameters - ---------- - dropout : float - percentage of elements to drop out, which - is 1 - percentage to retain. 
- """ - def __init__(self, dropout, prefix='dropout_', params=None): - super(DropoutCell, self).__init__(prefix, params) - assert isinstance(dropout, numeric_types), "dropout probability must be a number" - self.dropout = dropout - - @property - def state_shape(self): - return [] - - def __call__(self, inputs, states): - if self.dropout > 0: - inputs = symbol.Dropout(data=inputs, p=self.dropout) - return inputs, states - - def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None): - self.reset() - inputs, _ = _normalize_sequence(length, inputs, layout, merge_outputs) - if isinstance(inputs, symbol.Symbol): - return self(inputs, []) - else: - return super(DropoutCell, self).unroll( - length, inputs, begin_state=begin_state, layout=layout, - merge_outputs=merge_outputs) - - -class ModifierCell(BaseRNNCell): - """Base class for modifier cells. A modifier - cell takes a base cell, apply modifications - on it (e.g. Zoneout), and returns a new cell. - - After applying modifiers the base cell should - no longer be called directly. The modifer cell - should be used instead. - """ - def __init__(self, base_cell): - super(ModifierCell, self).__init__() - base_cell._modified = True - self.base_cell = base_cell - - @property - def params(self): - self._own_params = False - return self.base_cell.params - - @property - def state_shape(self): - return self.base_cell.state_shape - - def begin_state(self, init_sym=symbol.zeros, **kwargs): - assert not self._modified, \ - "After applying modifier cells (e.g. DropoutCell) the base " \ - "cell cannot be called directly. Call the modifier cell instead." 
- self.base_cell._modified = False - begin = self.base_cell.begin_state(init_sym, **kwargs) - self.base_cell._modified = True - return begin - - def unpack_weights(self, args): - return self.base_cell.unpack_weights(args) - - def pack_weights(self, args): - return self.base_cell.pack_weights(args) - - def __call__(self, inputs, states): - raise NotImplementedError - - -class ZoneoutCell(ModifierCell): - """Apply Zoneout on base cell""" - def __init__(self, base_cell, zoneout_outputs=0., zoneout_states=0.): - assert not isinstance(base_cell, FusedRNNCell), \ - "FusedRNNCell doesn't support zoneout. " \ - "Please unfuse first." - assert not isinstance(base_cell, BidirectionalCell), \ - "BidirectionalCell doesn't support zoneout since it doesn't support step. " \ - "Please add ZoneoutCell to the cells underneath instead." - assert not isinstance(base_cell, SequentialRNNCell) or not base_cell._bidirectional, \ - "Bidirectional SequentialRNNCell doesn't support zoneout. " \ - "Please add ZoneoutCell to the cells underneath instead." - super(ZoneoutCell, self).__init__(base_cell) - self.zoneout_outputs = zoneout_outputs - self.zoneout_states = zoneout_states - self.prev_output = None - - def reset(self): - super(ZoneoutCell, self).reset() - self.prev_output = None - - def __call__(self, inputs, states): - cell, p_outputs, p_states = self.base_cell, self.zoneout_outputs, self.zoneout_states - next_output, next_states = cell(inputs, states) - mask = (lambda p, like: - symbol.Dropout(_symbol_internal._identity_with_attr_like_rhs(symbol.ones((0, 0)), - like), - p=p)) - - prev_output = self.prev_output if self.prev_output else symbol.zeros((0, 0)) - - output = (symbol.where(mask(p_outputs, next_output), next_output, prev_output) - if p_outputs != 0. else next_output) - states = ([symbol.where(mask(p_states, new_s), new_s, old_s) for new_s, old_s in - zip(next_states, states)] if p_states != 0. 
else next_states) - - self.prev_output = output - - return output, states - - - -class BidirectionalCell(BaseRNNCell): - """Bidirectional RNN cell - - Parameters - ---------- - l_cell : BaseRNNCell - cell for forward unrolling - r_cell : BaseRNNCell - cell for backward unrolling - output_prefix : str, default 'bi_' - prefix for name of output - """ - def __init__(self, l_cell, r_cell, params=None, output_prefix='bi_'): - super(BidirectionalCell, self).__init__('', params=params) - self._override_cell_params = params is not None - self._cells = [l_cell, r_cell] - self._output_prefix = output_prefix - - def unpack_weights(self, args): - return _cells_unpack_weights(self._cells, args) - - def pack_weights(self, args): - return _cells_pack_weights(self._cells, args) - - def __call__(self, inputs, states): - raise NotImplementedError("Bidirectional cannot be stepped. Please use unroll") - - @property - def state_shape(self): - return _cells_state_shape(self._cells) - - def begin_state(self, **kwargs): - assert not self._modified, \ - "After applying modifier cells (e.g. DropoutCell) the base " \ - "cell cannot be called directly. Call the modifier cell instead." 
- return _cells_begin_state(self._cells, **kwargs) - - def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None): - self.reset() - - inputs, axis = _normalize_sequence(length, inputs, layout, False) - if begin_state is None: - begin_state = self.begin_state() - - states = begin_state - l_cell, r_cell = self._cells - l_outputs, l_states = l_cell.unroll(length, inputs=inputs, - begin_state=states[:len(l_cell.state_shape)], - layout=layout, merge_outputs=merge_outputs) - r_outputs, r_states = r_cell.unroll(length, - inputs=list(reversed(inputs)), - begin_state=states[len(l_cell.state_shape):], - layout=layout, merge_outputs=merge_outputs) - - if merge_outputs is None: - merge_outputs = (isinstance(l_outputs, symbol.Symbol) - and isinstance(r_outputs, symbol.Symbol)) - if not merge_outputs: - if isinstance(l_outputs, symbol.Symbol): - l_outputs = list(symbol.SliceChannel(l_outputs, axis=axis, - num_outputs=length, squeeze_axis=1)) - if isinstance(r_outputs, symbol.Symbol): - r_outputs = list(symbol.SliceChannel(r_outputs, axis=axis, - num_outputs=length, squeeze_axis=1)) - - if merge_outputs: - l_outputs = [l_outputs] - r_outputs = [symbol.reverse(r_outputs, axis=axis)] - else: - r_outputs = list(reversed(r_outputs)) - - outputs = [symbol.Concat(l_o, r_o, dim=1+merge_outputs, - name=('%sout'%(self._output_prefix) if merge_outputs - else '%st%d'%(self._output_prefix, i))) - for i, l_o, r_o in - zip(range(len(l_outputs)), l_outputs, r_outputs)] - - if merge_outputs: - outputs = outputs[0] - - states = [l_states, r_states] - return outputs, states diff --git a/seq2seq/seq2seq_bucketing.py b/seq2seq/seq2seq_bucketing.py deleted file mode 100755 index ef7b471..0000000 --- a/seq2seq/seq2seq_bucketing.py +++ /dev/null @@ -1,498 +0,0 @@ -import numpy as np -import mxnet as mx -import argparse -import cPickle as pickle -#import dill as pickle -import math -import nltk - -from mxnet.rnn import LSTMCell, SequentialRNNCell, FusedRNNCell -#from rnn_cell 
import LSTMCell, SequentialRNNCell -from itertools import takewhile, dropwhile - -from time import time -import re -from unidecode import unidecode - -from utils import array_to_text, tokenize_text, invert_dict, get_s2s_data, Dataset - -from seq2seq_iterator import * - -from attention_cell import AttentionEncoderCell, DotAttentionCell - -parser = argparse.ArgumentParser(description="Train RNN on Penn Tree Bank", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--infer', default=False, action='store_true', - help='whether to do inference instead of training') -parser.add_argument('--model-prefix', type=str, default=None, - help='path to save/load model') -parser.add_argument('--load-epoch', type=int, default=0, - help='load from epoch') -parser.add_argument('--num-layers', type=int, default=2, - help='number of stacked RNN layers') -parser.add_argument('--num-hidden', type=int, default=200, - help='hidden layer size') -parser.add_argument('--num-embed', type=int, default=200, - help='embedding layer size') -parser.add_argument('--bidirectional', type=bool, default=False, - help='whether to use bidirectional layers') -parser.add_argument('--gpus', type=str, - help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. 
' \ - 'Increase batch size when using multiple gpus for best performance.') -parser.add_argument('--kv-store', type=str, default='device', - help='key-value store type') -parser.add_argument('--num-epochs', type=int, default=25, - help='max num of epochs') -parser.add_argument('--lr', type=float, default=0.01, - help='initial learning rate') -parser.add_argument('--optimizer', type=str, default='sgd', - help='the optimizer type') -parser.add_argument('--mom', type=float, default=0.0, - help='momentum for sgd') -parser.add_argument('--wd', type=float, default=0.00001, - help='weight decay for sgd') -parser.add_argument('--batch-size', type=int, default=32, - help='the batch size.') -parser.add_argument('--disp-batches', type=int, default=50, - help='show progress for every n batches') -parser.add_argument('--max-grad-norm', type=float, default=5.0, - help='maximum gradient norm (larger values will be clipped') -# When training a deep, complex model, it's recommended to stack fused RNN cells (one -# layer per cell) together instead of one with all layers. The reason is that fused RNN -# cells doesn't set gradients to be ready until the computation for the entire layer is -# completed. Breaking a multi-layer fused RNN cell into several one-layer ones allows -# gradients to be processed ealier. This reduces communication overhead, especially with -# multiple GPUs. 
-parser.add_argument('--stack-rnn', default=False, - help='stack fused RNN cells to reduce communication overhead') -parser.add_argument('--dropout', type=float, default='0.0', - help='dropout probability (1.0 - keep probability)') -parser.add_argument('--use-cudnn-cells', action='store_true', - help='Use CUDNN LSTM (mx.rnn.FusedRNNCell) for training instead of in-graph LSTM cells (mx.rnn.LSTMCell)') - -#buckets = [32] -# buckets = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] - -start_label = 1 -invalid_label = 0 - -reserved_tokens={'':0, '':1, '':2, '':3} - -def print_inferred_shapes(node, arg_shapes, aux_shapes, out_shapes): - args = node.list_arguments() - aux_states = node.list_auxiliary_states() - outputs = node.list_outputs() - print("\n================================================") - print("\nNODE: %s" % node.name) - print("\n============") - print("args:") - print("============") - if len(arg_shapes) == 0: - print("N/A") - for i in range(len(arg_shapes)): - print("%s: %s" % (args[i], arg_shapes[i])) - print("\n=============") - print("aux_states:") - print("=============") - if len(aux_shapes) == 0: - print("N/A") - for i in range(len(aux_states)): - print("%s: %s" % (aux_states[i], aux_shapes[i])) - print("\n=============") - print("outputs:") - print("==============") - if len(out_shapes) == 0: - print("N/A") - for i in range(len(outputs)): - print("%s: %s" % (outputs[i], out_shapes[i])) - print("\n================================================") - print("\n") - -def _normalize_sequence(length, inputs, layout, merge, in_layout=None): - from mxnet import symbol, init, ndarray, _symbol_internal - - assert inputs is not None, \ - "unroll(inputs=None) has been deprecated. " \ - "Please create input variables outside unroll." 
- - axis = layout.find('T') - in_axis = in_layout.find('T') if in_layout is not None else axis - if isinstance(inputs, symbol.Symbol): - if merge is False: - assert len(inputs.list_outputs()) == 1, \ - "unroll doesn't allow grouped symbol as input. Please convert " \ - "to list with list(inputs) first or let unroll handle splitting." - inputs = list(symbol.split(inputs, axis=in_axis, num_outputs=length, - squeeze_axis=1)) - else: - assert length is None or len(inputs) == length - if merge is True: - inputs = [symbol.expand_dims(i, axis=axis) for i in inputs] - inputs = symbol.Concat(*inputs, dim=axis) - in_axis = axis - - if isinstance(inputs, symbol.Symbol) and axis != in_axis: - inputs = symbol.swapaxes(inputs, dim0=axis, dim1=in_axis) - - return inputs, axis - -def get_data(layout): - - start = time() - - print("\nUnpickling training iterator") - - with open('./data/train_iterator.pkl', 'rb') as f: # _en_de.pkl - train_iter = pickle.load(f) - - train_iter.initialize() - train_iter.batch_size = args.batch_size - - print("\nUnpickling validation iterator") - - with open('./data/valid_iterator.pkl', 'rb') as f: # _en_de.pkl - valid_iter = pickle.load(f) - - valid_iter.initialize() - valid_iter.batch_size = args.batch_size - - print("\nEncoded source language sentences:\n") - for i in range(5): - print(array_to_text(train_iter.src_sent[i], train_iter.inv_src_vocab)) - - print("\nEncoded target language sentences:\n") - for i in range(5): - print(array_to_text(valid_iter.targ_sent[i], train_iter.inv_targ_vocab)) - - duration = time() - start - - print("\nDataset deserialization time: %.2f seconds\n" % duration) - - return train_iter, valid_iter, train_iter.src_vocab, train_iter.targ_vocab - -# WORK IN PROGRESS !!! 
-def decoder_unroll(decoder, target_embed, targ_vocab, unroll_length, go_symbol, begin_state=None, layout='TNC', merge_outputs=None): - - decoder.reset() - - if begin_state is None: - begin_state = decoder.begin_state() - - inputs, _ = _normalize_sequence(unroll_length, target_embed, layout, False) - - # Need to use hidden state from attention model, but as input - states = begin_state - outputs = [] - - embed = inputs[0] - - # NEW 1 -# fc_weight = mx.sym.Variable('fc_weight') -# fc_bias = mx.sym.Variable('fc_bias') -# em_weight = mx.sym.Variable('em_weight') -# for i in range(0, unroll_length): -# output, states = decoder(embed, states) -# outputs.append(embed) -# fc = mx.sym.FullyConnected(data=output, weight=fc_weight, bias=fc_bias, num_hidden=len(targ_vocab), name='decoder_fc%d_'%i) -# am = mx.sym.argmax(data=fc, axis=1) -# embed = mx.sym.Embedding(data=am, weight=em_weight, input_dim=len(targ_vocab), -# output_dim=args.num_embed, name='decoder_embed%d_'%i) - - # NEW 2 - for i in range(0, unroll_length): - embed, states = decoder(embed, states) - outputs.append(embed) - - outputs, _ = _normalize_sequence(unroll_length, outputs, layout, merge_outputs) - - return outputs, states - -def train(args): - - from time import time - - data_train, data_val, src_vocab, targ_vocab = get_data('TN') - print "len(src_vocab) len(targ_vocab)", len(src_vocab), len(targ_vocab) - - encoder = SequentialRNNCell() - - if args.use_cudnn_cells: - encoder.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout, - mode='lstm', prefix='lstm_encoder', bidirectional=args.bidirectional, get_next_state=True)) - else: - for i in range(args.num_layers): - encoder.add(LSTMCell(args.num_hidden, prefix='rnn_encoder%d_' % i)) - if i < args.num_layers - 1 and args.dropout > 0.0: - encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i)) - encoder.add(AttentionEncoderCell()) - - decoder = mx.rnn.SequentialRNNCell() - - if args.use_cudnn_cells: - 
decoder.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, - mode='lstm', prefix='lstm_decoder', bidirectional=args.bidirectional, get_next_state=True)) - else: - for i in range(args.num_layers): - decoder.add(LSTMCell(args.num_hidden, prefix=('rnn_decoder%d_' % i))) - if i < args.num_layers - 1 and args.dropout > 0.0: - decoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_decoder%d_' % i)) - decoder.add(DotAttentionCell()) - - def sym_gen(seq_len): - src_data = mx.sym.Variable('src_data') - targ_data = mx.sym.Variable('targ_data') - label = mx.sym.Variable('softmax_label') - - src_embed = mx.sym.Embedding(data=src_data, input_dim=len(src_vocab), - output_dim=args.num_embed, name='src_embed') - targ_embed = mx.sym.Embedding(data=targ_data, input_dim=len(targ_vocab), # data=data - output_dim=args.num_embed, name='targ_embed') - - encoder.reset() - decoder.reset() - - enc_seq_len, dec_seq_len = seq_len - - layout = 'TNC' - _, states = encoder.unroll(enc_seq_len, inputs=src_embed, layout=layout) - - # This should be based on EOS or max seq len for inference, but here we unroll to the target length - # TODO: fix symbol - outputs, _ = decoder.unroll(dec_seq_len, targ_embed, begin_state=states, layout=layout, merge_outputs=True) -# outputs, _ = decoder_unroll(decoder, targ_embed, targ_vocab, dec_seq_len, 0, begin_state=states, layout='TNC', merge_outputs=True) - - # NEW - rs = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden), name='sym_gen_reshape1') - fc = mx.sym.FullyConnected(data=rs, num_hidden=len(targ_vocab), name='sym_gen_fc') - label_rs = mx.sym.Reshape(data=label, shape=(-1,), name='sym_gen_reshape2') - pred = mx.sym.SoftmaxOutput(data=fc, label=label_rs, name='sym_gen_softmax') - - return pred, ('src_data', 'targ_data',), ('softmax_label',) - - -# foo, _, _ = sym_gen((1, 1)) -# print(type(foo)) -# mx.viz.plot_network(symbol=foo).save('./seq2seq.dot') - - - if args.gpus: - contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] - else: 
- contexts = mx.cpu(0) - - model = mx.mod.BucketingModule( - sym_gen = sym_gen, - default_bucket_key = data_train.default_bucket_key, - context = contexts) - - if args.load_epoch: - _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint( - cell, args.model_prefix, args.load_epoch) - else: - arg_params = None - aux_params = None - - opt_params = { - 'learning_rate': args.lr, - 'wd': args.wd - } - - if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']: - opt_params['momentum'] = args.mom - - opt_params['clip_gradient'] = args.max_grad_norm - - start = time() - - model.fit( - train_data = data_train, - eval_data = data_val, - eval_metric = mx.metric.Perplexity(invalid_label), - kvstore = args.kv_store, - optimizer = args.optimizer, - optimizer_params = opt_params, - initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), - arg_params = arg_params, - aux_params = aux_params, - begin_epoch = args.load_epoch, - num_epoch = args.num_epochs, - batch_end_callback = mx.callback.Speedometer(batch_size=args.batch_size, frequent=args.disp_batches, auto_reset=True), - epoch_end_callback = mx.rnn.do_rnn_checkpoint(decoder, args.model_prefix, 1) - if args.model_prefix else None) - - train_duration = time() - start - time_per_epoch = train_duration / args.num_epochs - print("\n\nTime per epoch: %.2f seconds\n\n" % time_per_epoch) - -class BleuScore(mx.metric.EvalMetric): - def __init__(self, ignore_label, axis=-1): - super(BleuScore, self).__init__('BleuScore') - self.ignore_label = ignore_label - self.axis = axis - - def update(self, labels, preds): - assert len(labels) == len(preds) - - def drop_sentinels(text_lst): - sentinels = lambda x: x == reserved_tokens[''] or x == reserved_tokens[''] - text_lst = dropwhile(lambda x: sentinels(x), text_lst) - text_lst = takewhile(lambda x: not sentinels(x) and x != reserved_tokens[''], text_lst) - return list(text_lst) - - smoothing_fn = nltk.translate.bleu_score.SmoothingFunction().method3 - - for label, pred in 
zip(labels, preds): - maxed = mx.ndarray.argmax(data=pred, axis=1) - pred_nparr = maxed.asnumpy() - label_nparr = label.asnumpy().astype(np.int32) - sent_len, batch_size = np.shape(label_nparr) - pred_nparr = pred_nparr.reshape(sent_len, batch_size).astype(np.int32) - - for i in range(batch_size): - exp_lst = drop_sentinels(label_nparr[:, i].tolist()) - act_lst = drop_sentinels(pred_nparr[:, i].tolist()) - expected = exp_lst - actual = act_lst - bleu = nltk.translate.bleu_score.sentence_bleu( - references=[expected], hypothesis=actual, weights=(0.25, 0.25, 0.25, 0.25), - smoothing_function = smoothing_fn - ) -# print("bleu: %f" % bleu) - self.sum_metric += bleu - self.num_inst += 1 - assert label.size == pred.size/pred.shape[-1], \ - "shape mismatch: %s vs. %s"%(label.shape, pred.shape) - - def get(self): - num = self.num_inst if self.num_inst > 0 else float('nan') - return (self.name, self.sum_metric/num) - - -def infer(args): - assert args.model_prefix, "Must specifiy path to load from" - - data_train, data_val, src_vocab, targ_vocab = get_data('TN') - - print "len(src_vocab) len(targ_vocab)", len(src_vocab), len(targ_vocab) - - if args.use_cudnn_cells: - encoder = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout, - mode='lstm', prefix='lstm_encoder', bidirectional=args.bidirectional, get_next_state=True).unfuse() - - else: - encoder = SequentialRNNCell() - - for i in range(args.num_layers): - encoder.add(LSTMCell(args.num_hidden, prefix='rnn_encoder%d_' % i)) - if i < args.num_layers - 1 and args.dropout > 0.0: - encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i)) - - encoder.add(AttentionEncoderCell()) - - if args.use_cudnn_cells: - decoder = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, - mode='lstm', prefix='lstm_decoder', bidirectional=args.bidirectional, get_next_state=True).unfuse() - - else: - decoder = mx.rnn.SequentialRNNCell() - - for i in range(args.num_layers): - 
decoder.add(LSTMCell(args.num_hidden, prefix=('rnn_decoder%d_' % i))) - if i < args.num_layers - 1 and args.dropout > 0.0: - decoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_decoder%d_' % i)) - - decoder.add(DotAttentionCell()) - - def sym_gen(seq_len): - src_data = mx.sym.Variable('src_data') - targ_data = mx.sym.Variable('targ_data') - label = mx.sym.Variable('softmax_label') - - src_embed = mx.sym.Embedding(data=src_data, input_dim=len(src_vocab), - output_dim=args.num_embed, name='src_embed') - targ_embed = mx.sym.Embedding(data=targ_data, input_dim=len(targ_vocab), # data=data - output_dim=args.num_embed, name='targ_embed') - - encoder.reset() - decoder.reset() - - enc_seq_len, dec_seq_len = seq_len - - layout = 'TNC' - _, states = encoder.unroll(enc_seq_len, inputs=src_embed, layout=layout) - - # This should be based on EOS or max seq len for inference, but here we unroll to the target length - # TODO: fix symbol -# outputs, _ = decoder.unroll(dec_seq_len, targ_embed, begin_state=states, layout=layout, merge_outputs=True) - outputs, _ = decoder_unroll(decoder, targ_embed, targ_vocab, dec_seq_len, 0, begin_state=states, layout='TNC', merge_outputs=True) - - # NEW - rs = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden), name='sym_gen_reshape1') - fc = mx.sym.FullyConnected(data=rs, num_hidden=len(targ_vocab), name='sym_gen_fc') - label_rs = mx.sym.Reshape(data=label, shape=(-1,), name='sym_gen_reshape2') - pred = mx.sym.SoftmaxOutput(data=fc, label=label_rs, name='sym_gen_softmax') - - return pred, ('src_data', 'targ_data',), ('softmax_label',) - - if args.gpus: - contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] - else: - contexts = mx.cpu(0) - - model = mx.mod.BucketingModule( - sym_gen = sym_gen, - default_bucket_key = data_train.default_bucket_key, - context = contexts) - - model.bind(data_val.provide_data, data_val.provide_label, for_training=False) - - if args.load_epoch: - _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint( - 
decoder, args.model_prefix, args.load_epoch) - model.set_params(arg_params, aux_params) - - else: - arg_params = None - aux_params = None - - opt_params = { - 'learning_rate': args.lr, - 'wd': args.wd - } - - if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']: - opt_params['momentum'] = args.mom - - opt_params['clip_gradient'] = args.max_grad_norm - - start = time() - - # mx.metric.Perplexity - model.score(data_val, BleuScore(invalid_label), #PPL(invalid_label), - batch_end_callback=mx.callback.Speedometer(batch_size=args.batch_size, frequent=5, auto_reset=True)) - - infer_duration = time() - start - time_per_epoch = infer_duration / args.num_epochs - print("\n\nTime per epoch: %.2f seconds\n\n" % time_per_epoch) - -if __name__ == '__main__': - import logging - head = '%(asctime)-15s %(message)s' - logging.basicConfig(level=logging.DEBUG, format=head) - - args = parser.parse_args() - if args.gpus: - contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] - else: - contexts = mx.cpu(0) - - - if args.num_layers >= 4 and len(args.gpus.split(',')) >= 4 and not args.stack_rnn: - print('WARNING: stack-rnn is recommended to train complex model on multiple GPUs') - - if args.infer: - # Demonstrates how to load a model trained with CuDNN RNN and predict - # with non-fused MXNet symbol - infer(args) - else: - train(args) diff --git a/seq2seq_bucketing.py b/seq2seq_bucketing.py new file mode 100755 index 0000000..e7bbd1a --- /dev/null +++ b/seq2seq_bucketing.py @@ -0,0 +1,737 @@ +import numpy as np +import mxnet as mx +import argparse +import cPickle as pickle +#import dill as pickle +import math +import nltk + +from mxnet.rnn import LSTMCell, SequentialRNNCell, FusedRNNCell, BidirectionalCell +#from rnn_cell import LSTMCell, SequentialRNNCell +from itertools import takewhile, dropwhile +from operator import itemgetter + +from time import time +import re +from unidecode import unidecode + +from utils import array_to_text, tokenize_text, invert_dict, 
get_s2s_data, Dataset + +from seq2seq_iterator import * + +# from attention_cell import AttentionEncoderCell, DotAttentionCell + +parser = argparse.ArgumentParser(description="Train RNN on Penn Tree Bank", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--infer', default=False, action='store_true', + help='whether to do inference instead of training') +parser.add_argument('--model-prefix', type=str, default=None, + help='path to save/load model') +parser.add_argument('--load-epoch', type=int, default=0, + help='load from epoch') +parser.add_argument('--num-layers', type=int, default=2, + help='number of stacked RNN layers') +parser.add_argument('--num-hidden', type=int, default=200, + help='hidden layer size') +parser.add_argument('--num-embed', type=int, default=200, + help='embedding layer size') +parser.add_argument('--bidirectional', action='store_true', + help='whether to use bidirectional layers') +parser.add_argument('--gpus', type=str, + help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. 
' \ + 'Increase batch size when using multiple gpus for best performance.') +parser.add_argument('--kv-store', type=str, default='device', + help='key-value store type') +parser.add_argument('--num-epochs', type=int, default=25, + help='max num of epochs') +parser.add_argument('--lr', type=float, default=0.01, + help='initial learning rate') +parser.add_argument('--optimizer', type=str, default='sgd', + help='the optimizer type') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--wd', type=float, default=0.00001, + help='weight decay for sgd') +parser.add_argument('--batch-size', type=int, default=32, + help='the batch size.') +parser.add_argument('--disp-batches', type=int, default=50, + help='show progress for every n batches') +parser.add_argument('--max-grad-norm', type=float, default=5.0, + help='maximum gradient norm (larger values will be clipped') + +# When training a deep, complex model, it's recommended to stack fused RNN cells (one +# layer per cell) together instead of one with all layers. The reason is that fused RNN +# cells doesn't set gradients to be ready until the computation for the entire layer is +# completed. Breaking a multi-layer fused RNN cell into several one-layer ones allows +# gradients to be processed ealier. This reduces communication overhead, especially with +# multiple GPUs. 
+parser.add_argument('--stack-rnn', default=False, + help='stack fused RNN cells to reduce communication overhead') +parser.add_argument('--dropout', type=float, default='0.0', + help='dropout probability (1.0 - keep probability)') +parser.add_argument('--use-cudnn-cells', action='store_true', + help='Use CUDNN LSTM (mx.rnn.FusedRNNCell) for training instead of in-graph LSTM cells (mx.rnn.LSTMCell)') + +parser.add_argument('--inference-unrolling-for-training', action='store_true', + help='Feed previous prediction (instead of previous ground truth) into the decoder input during training') +parser.add_argument('--seed', type=int, default=1234, + help='Set random seed for Python, NumPy and MxNet RNGs') + +parser.add_argument('--remove-state-feed', action='store_true', + help='Remove direct state feeding from encoder to decoder (use when using attention)') + + +parser.add_argument('--input-feed', action='store_true', + help='Enable input feed (attention is fed into the decoder as input, rather than concatenated with output)') + +parser.add_argument('--attention', action='store_true', + help='Use attention (dot attention is the currently implemented form') + +#buckets = [32] +# buckets = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] + +start_label = 1 +invalid_label = 0 + +reserved_tokens={'':0, '':1, '':2, '':3} + +def print_inferred_shapes(node, arg_shapes, aux_shapes, out_shapes): + args = node.list_arguments() + aux_states = node.list_auxiliary_states() + outputs = node.list_outputs() + print("\n================================================") + print("\nNODE: %s" % node.name) + print("\n============") + print("args:") + print("============") + if len(arg_shapes) == 0: + print("N/A") + for i in range(len(arg_shapes)): + print("%s: %s" % (args[i], arg_shapes[i])) + print("\n=============") + print("aux_states:") + print("=============") + if len(aux_shapes) == 0: + print("N/A") + for i in range(len(aux_states)): + print("%s: %s" % (aux_states[i], aux_shapes[i])) + 
print("\n=============") + print("outputs:") + print("==============") + if len(out_shapes) == 0: + print("N/A") + for i in range(len(outputs)): + print("%s: %s" % (outputs[i], out_shapes[i])) + print("\n================================================") + print("\n") + +def _normalize_sequence(length, inputs, layout, merge, in_layout=None): + from mxnet import symbol, init, ndarray, _symbol_internal + + assert inputs is not None, \ + "unroll(inputs=None) has been deprecated. " \ + "Please create input variables outside unroll." + + axis = layout.find('T') + in_axis = in_layout.find('T') if in_layout is not None else axis + if isinstance(inputs, symbol.Symbol): + if merge is False: + assert len(inputs.list_outputs()) == 1, \ + "unroll doesn't allow grouped symbol as input. Please convert " \ + "to list with list(inputs) first or let unroll handle splitting." + inputs = list(symbol.split(inputs, axis=in_axis, num_outputs=length, + squeeze_axis=1)) + else: + assert length is None or len(inputs) == length + if merge is True: + inputs = [symbol.expand_dims(i, axis=axis) for i in inputs] + inputs = symbol.Concat(*inputs, dim=axis) + in_axis = axis + + if isinstance(inputs, symbol.Symbol) and axis != in_axis: + inputs = symbol.swapaxes(inputs, dim0=axis, dim1=in_axis) + + return inputs, axis + +def get_data(layout, infer=False): + + start = time() + + print("\nUnpickling training iterator") + + if not infer: + with open('./data/train_iterator.pkl', 'rb') as f: # _en_de.pkl + train_iter = pickle.load(f) + + train_iter.initialize(curr_batch_size=args.batch_size) + + print("\nUnpickling validation iterator") + + with open('./data/valid_iterator.pkl', 'rb') as f: # _en_de.pkl + valid_iter = pickle.load(f) + + valid_iter.initialize(curr_batch_size=args.batch_size) + + with open('./data/test_iterator.pkl', 'rb') as f: + test_iter = pickle.load(f) + + test_iter.initialize(curr_batch_size=args.batch_size) + +# print("\nEncoded source language sentences:\n") +# for i in range(5): 
+# print(array_to_text(train_iter.src_sent[i], train_iter.inv_src_vocab)) + +# print("\nEncoded target language sentences:\n") +# for i in range(5): +# print(array_to_text(train_iter.targ_sent[i], train_iter.inv_targ_vocab)) + + duration = time() - start + + print("\nDataset deserialization time: %.2f seconds\n" % duration) + + if not infer: + return train_iter, valid_iter, test_iter, train_iter.src_vocab, train_iter.targ_vocab, train_iter.inv_src_vocab, train_iter.inv_targ_vocab + else: + return test_iter, test_iter.src_vocab, test_iter.inv_src_vocab, test_iter.targ_vocab, test_iter.inv_targ_vocab + +def attention_step(i, encoder_outputs, decoder_output): + + attention_state = mx.sym.zeros_like(encoder_outputs[-1], name='train_dec_unroll_attention_state') + curr_att_input = mx.sym.expand_dims(decoder_output, axis=2, name='train_dec_unroll_expand_dims_%d_' % i) + enc_len = len(encoder_outputs) + dots = [] + concat_dots = None + # loop over all the encoder periods to create weights for weighted state + for j in range(enc_len): + transposed = mx.sym.expand_dims(encoder_outputs[j], axis=2) + transposed = mx.sym.transpose(transposed, axes=(0, 2, 1), name='train_decoder_transpose%d_' % i) + dot = mx.sym.batch_dot(transposed, curr_att_input, name='train_decoder_batch_dot_%d_%d_' % (i, j)) + dot = mx.sym.exp(dot, name='train_decoder_exp_%d_%d' % (i, j)) + # The batch size shouldn't be an arg here anyway. We should just remove extra dimensions + # and then transpose. 
+ dot = mx.sym.reshape(dot, shape=(1, args.batch_size / len(contexts)), + name='train_decoder_unroll_reshape_%d_%d' % (i, j)) + dots.append(dot) + if not concat_dots: + concat_dots = dot + else: + concat_dots = mx.sym.concat(concat_dots, dot) + dot_sum = mx.sym.sum(concat_dots, axis=1) + for j in range(enc_len): + curr_dot = mx.sym.transpose(dots[j]) + attention_state += mx.sym.broadcast_mul(curr_dot, encoder_outputs[j], + name='train_encoder_acc_attention_%d_%d_' % (i, j)) + + attention_state = mx.sym.broadcast_div(attention_state, dot_sum) + + return attention_state + + +def train_decoder_unroll(decoder, encoder_outputs, target_embed, targ_vocab, unroll_length, + go_symbol, fc_weight, fc_bias, attention_fc_weight, attention_fc_bias, targ_em_weight, + begin_state=None, layout='TNC', merge_outputs=None): + decoder.reset() + if begin_state is None: + begin_state = decoder.begin_state() + inputs, _ = _normalize_sequence(unroll_length, target_embed, layout, False) + # Need to use hidden state from attention model, but as input + states = begin_state + outputs = [] + + #At the first time step there is no previous attention + attention_state = None + dec_out = None + + for i in range(unroll_length): + if args.input_feed: + # Copy previous attention output to concatenate with the embedding input + prev_attention_state = attention_state if attention_state else mx.sym.zeros_like(encoder_outputs[-1], name='train_dec_unroll_prev_attention_state') + decoder_feed = mx.sym.concat(inputs[i], prev_attention_state, name = 'decoder_feed_concat_%d_' % i) + else: + decoder_feed = inputs[i] + + prev_dec_out = dec_out if dec_out else mx.sym.zeros_like(encoder_outputs[-1], name='train_dec_unroll_prev_dec_out') # begin_state + dec_out, states = decoder(decoder_feed, states) + + if args.attention: + # The attention receives as input all the encoder outputs and the current decoder output and return the vector + # for this time step + attention_state = attention_step(i, encoder_outputs, 
prev_dec_out) + # The attention output is combined with the decoder output for computing the next word + concatenated = mx.sym.concat(dec_out, attention_state, name = 'train_decoder_concat_%d_' % i) + attention_fc = mx.sym.FullyConnected( + data=concatenated, weight=attention_fc_weight, bias=attention_fc_bias, num_hidden=args.num_hidden, name='attention_fc%d_' % i + ) + curr_out = mx.sym.Activation(data = attention_fc, act_type='tanh', name = 'attention_tanh%d_' % i) + else: + # We avoid all the attention computation + curr_out = dec_out + outputs.append(curr_out) + outputs, _ = _normalize_sequence(unroll_length, outputs, layout, merge_outputs) + return outputs, states + + +def infer_decoder_unroll(decoder, encoder_outputs, target_embed, targ_vocab, unroll_length, + go_symbol, fc_weight, fc_bias, attention_fc_weight, attention_fc_bias, targ_em_weight, + begin_state=None, layout='TNC', merge_outputs=None): + decoder.reset() + if begin_state is None: + begin_state = decoder.begin_state() + inputs, _ = _normalize_sequence(unroll_length, target_embed, layout, False) + # Need to use hidden state from attention model, but as input + states = begin_state + outputs = [] + embed = inputs[0] + + attention_state = None + + for i in range(unroll_length): + if args.input_feed: + # Copy previous attention output to concatenate with the embedding input + prev_attention_state = attention_state if attention_state else mx.sym.zeros_like(encoder_outputs[-1], + name='train_dec_unroll_prev_attention_state') + decoder_feed = mx.sym.concat(embed, prev_attention_state, name='decoder_feed_concat_%d_' % i) + else: + decoder_feed = embed + dec_out, states = decoder(decoder_feed, states) + + # Should this be dec_out or states as the first argument? 
+ if args.attention: + attention_state = attention_step(i, encoder_outputs, dec_out) + concatenated = mx.sym.concat(dec_out, attention_state, name = 'train_decoder_concat_%d_' % i) + attention_fc = mx.sym.FullyConnected( + data=concatenated, weight=attention_fc_weight, bias=attention_fc_bias, num_hidden=args.num_hidden, name='attention_fc%d_' % i + ) + curr_out = mx.sym.Activation(data = attention_fc, act_type='tanh', name = 'attention_tanh%d_' % i) + else: + curr_out = dec_out + outputs.append(curr_out) + fc = mx.sym.FullyConnected(data=curr_out, weight=fc_weight, bias=fc_bias, num_hidden=len(targ_vocab), name='decoder_fc%d_'%i) + am = mx.sym.argmax(data=fc, axis=1) + embed = mx.sym.Embedding(data=am, weight=targ_em_weight, input_dim=len(targ_vocab), output_dim=args.num_embed, name='decoder_embed%d_'%i) + + outputs, _ = _normalize_sequence(unroll_length, outputs, layout, merge_outputs) + return outputs, states + +def train(args): + + from time import time + + data_train, data_val, _, src_vocab, targ_vocab, inv_src_vocab, inv_targ_vocab = get_data('TN') + print "len(src_vocab) len(targ_vocab)", len(src_vocab), len(targ_vocab) + + attention_fc_weight = mx.sym.Variable('attention_fc_weight') + attention_fc_bias = mx.sym.Variable('attention_fc_bias') + + fc_weight = mx.sym.Variable('fc_weight') + fc_bias = mx.sym.Variable('fc_bias') + targ_em_weight = mx.sym.Variable('targ_embed_weight') + + encoder = SequentialRNNCell() + + if args.use_cudnn_cells: + encoder.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout, + mode='lstm', prefix='lstm_encoder', bidirectional=args.bidirectional, get_next_state=True)) + else: + for i in range(args.num_layers): + if args.bidirectional: + encoder.add( + BidirectionalCell( + LSTMCell(args.num_hidden // 2, prefix='rnn_encoder_f%d_' % i), + LSTMCell(args.num_hidden // 2, prefix='rnn_encoder_b%d_' % i))) + if i < args.num_layers - 1 and args.dropout > 0.0: + 
encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i)) + else: + encoder.add( + LSTMCell(args.num_hidden, prefix='rnn_encoder%d_' % i)) + if i < args.num_layers - 1 and args.dropout > 0.0: + encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i)) + + decoder = mx.rnn.SequentialRNNCell() + + if args.use_cudnn_cells: + decoder.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, + mode='lstm', prefix='lstm_decoder', bidirectional=args.bidirectional, get_next_state=True)) + else: + for i in range(args.num_layers): + decoder.add(LSTMCell(args.num_hidden, prefix=('rnn_decoder%d_' % i))) + if i < args.num_layers - 1 and args.dropout > 0.0: + decoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_decoder%d_' % i)) + + def sym_gen(seq_len): + src_data = mx.sym.Variable('src_data') + targ_data = mx.sym.Variable('targ_data') + label = mx.sym.Variable('softmax_label') + + src_embed = mx.sym.Embedding(data=src_data, input_dim=len(src_vocab), + output_dim=args.num_embed, name='src_embed') + targ_embed = mx.sym.Embedding(data=targ_data, weight=targ_em_weight, input_dim=len(targ_vocab), # data=data + output_dim=args.num_embed, name='targ_embed') + + encoder.reset() + decoder.reset() + + enc_seq_len, dec_seq_len = seq_len + + layout = 'TNC' + encoder_outputs, encoder_states = encoder.unroll(enc_seq_len, inputs=src_embed, layout=layout) + + if args.bidirectional: + encoder_states = [mx.sym.concat(encoder_states[0][0], encoder_states[0][1]), + mx.sym.concat(encoder_states[0][1], encoder_states[1][1])] + + if args.remove_state_feed: + encoder_states = None + + # This should be based on EOS or max seq len for inference, but here we unroll to the target length + # TODO: fix symbol + if args.inference_unrolling_for_training: + outputs, _ = infer_decoder_unroll(decoder, encoder_outputs, targ_embed, targ_vocab, dec_seq_len, 0, fc_weight, fc_bias, + attention_fc_weight, attention_fc_bias, + targ_em_weight, begin_state=encoder_states, 
layout='TNC', merge_outputs=True) + else: + outputs, _ = train_decoder_unroll(decoder, encoder_outputs, targ_embed, targ_vocab, dec_seq_len, 0, fc_weight, fc_bias, + attention_fc_weight, attention_fc_bias, + targ_em_weight, begin_state=encoder_states, layout='TNC', merge_outputs=True) + + # NEW + rs = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden), name='sym_gen_reshape1') + fc = mx.sym.FullyConnected(data=rs, weight=fc_weight, bias=fc_bias, num_hidden=len(targ_vocab), name='sym_gen_fc') + label_rs = mx.sym.Reshape(data=label, shape=(-1,), name='sym_gen_reshape2') + pred = mx.sym.SoftmaxOutput(data=fc, label=label_rs, name='sym_gen_softmax') + + return pred, ('src_data', 'targ_data',), ('softmax_label',) + + +# foo, _, _ = sym_gen((1, 1)) +# print(type(foo)) +# mx.viz.plot_network(symbol=foo).save('./seq2seq.dot') + + + if args.gpus: + contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] + else: + contexts = mx.cpu(0) + + model = mx.mod.BucketingModule( + sym_gen = sym_gen, + default_bucket_key = data_train.default_bucket_key, + context = contexts) + + if args.load_epoch: + _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint( + [encoder, decoder], args.model_prefix, args.load_epoch) + else: + arg_params = None + aux_params = None + + opt_params = { + 'learning_rate': args.lr, + 'wd': args.wd + } + + if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']: + opt_params['momentum'] = args.mom + + opt_params['clip_gradient'] = args.max_grad_norm + + start = time() + + model.fit( + train_data = data_train, + eval_data = data_val, + eval_metric = mx.metric.Perplexity(invalid_label), + kvstore = args.kv_store, + optimizer = args.optimizer, + optimizer_params = opt_params, + initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), + arg_params = arg_params, + aux_params = aux_params, + begin_epoch = args.load_epoch, + num_epoch = args.num_epochs, + batch_end_callback = mx.callback.Speedometer(batch_size=args.batch_size, 
frequent=args.disp_batches, auto_reset=True), + epoch_end_callback = mx.rnn.do_rnn_checkpoint([encoder, decoder], args.model_prefix, 1) + if args.model_prefix else None) + + train_duration = time() - start + time_per_epoch = train_duration / args.num_epochs + print("\n\nTime per epoch: %.2f seconds\n\n" % time_per_epoch) + + +def drop_sentinels(text_lst): + sentinels = lambda x: x == reserved_tokens[''] or x == reserved_tokens[''] + text_lst = dropwhile(lambda x: sentinels(x), text_lst) + text_lst = takewhile(lambda x: not sentinels(x) and x != reserved_tokens[''], text_lst) + return list(text_lst) + + +class BleuScore(mx.metric.EvalMetric): + def __init__(self, ignore_label, axis=-1): + super(BleuScore, self).__init__('BleuScore') + self.ignore_label = ignore_label + self.axis = axis + + def update(self, labels, preds): + assert len(labels) == len(preds) + + smoothing_fn = nltk.translate.bleu_score.SmoothingFunction().method3 + + for label, pred in zip(labels, preds): + maxed = mx.ndarray.argmax(data=pred, axis=1) + pred_nparr = maxed.asnumpy() + label_nparr = label.asnumpy().astype(np.int32) + sent_len, batch_size = np.shape(label_nparr) + pred_nparr = pred_nparr.reshape(sent_len, batch_size).astype(np.int32) + + for i in range(batch_size): + exp_lst = drop_sentinels(label_nparr[:, i].tolist()) + act_lst = drop_sentinels(pred_nparr[:, i].tolist()) + expected = exp_lst + actual = act_lst + bleu = nltk.translate.bleu_score.sentence_bleu( + references=[expected], hypothesis=actual, weights=(0.25, 0.25, 0.25, 0.25), + smoothing_function = smoothing_fn + ) +# print("bleu: %f" % bleu) + self.sum_metric += bleu + self.num_inst += 1 + assert label.size == pred.size/pred.shape[-1], \ + "shape mismatch: %s vs. 
%s"%(label.shape, pred.shape) + + def get(self): + num = self.num_inst if self.num_inst > 0 else float('nan') + return (self.name, self.sum_metric/num) + + +def infer(args): + assert args.model_prefix, "Must specifiy path to load from" + + data_test, src_vocab, inv_src_vocab, targ_vocab, inv_targ_vocab = get_data('TN', infer=True) + + print "len(src_vocab) len(targ_vocab)", len(src_vocab), len(targ_vocab) + + attention_fc_weight = mx.sym.Variable('attention_fc_weight') + attention_fc_bias = mx.sym.Variable('attention_fc_bias') + + fc_weight = mx.sym.Variable('fc_weight') + fc_bias = mx.sym.Variable('fc_bias') + targ_em_weight = mx.sym.Variable('targ_embed_weight') + + if args.use_cudnn_cells: + encoder = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout, + mode='lstm', prefix='lstm_encoder', bidirectional=args.bidirectional, get_next_state=True).unfuse() + + else: + encoder = SequentialRNNCell() + + for i in range(args.num_layers): + if args.bidirectional: + encoder.add( + BidirectionalCell( + LSTMCell(args.num_hidden // 2, prefix='rnn_encoder_f%d_' % i), + LSTMCell(args.num_hidden // 2, prefix='rnn_encoder_b%d_' % i))) + if i < args.num_layers - 1 and args.dropout > 0.0: + encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i)) + else: + encoder.add( + LSTMCell(args.num_hidden, prefix='rnn_encoder%d_' % i)) + if i < args.num_layers - 1 and args.dropout > 0.0: + encoder.add(mx.rnn.DropoutCell(args.dropout, prefix='rnn_encoder%d_' % i)) + + if args.use_cudnn_cells: + decoder = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, + mode='lstm', prefix='lstm_decoder', bidirectional=args.bidirectional, get_next_state=True).unfuse() + + else: + decoder = mx.rnn.SequentialRNNCell() + + for i in range(args.num_layers): + decoder.add(LSTMCell(args.num_hidden, prefix=('rnn_decoder%d_' % i))) + if i < args.num_layers - 1 and args.dropout > 0.0: + decoder.add(mx.rnn.DropoutCell(args.dropout, 
prefix='rnn_decoder%d_' % i)) + + def sym_gen(seq_len): + src_data = mx.sym.Variable('src_data') + targ_data = mx.sym.Variable('targ_data') + label = mx.sym.Variable('softmax_label') + + src_embed = mx.sym.Embedding(data=src_data, input_dim=len(src_vocab), + output_dim=args.num_embed, name='src_embed') + targ_embed = mx.sym.Embedding(data=targ_data, input_dim=len(targ_vocab), + weight = targ_em_weight, # data=data + output_dim=args.num_embed, name='targ_embed') + + encoder.reset() + decoder.reset() + + enc_seq_len, dec_seq_len = seq_len + + layout = 'TNC' + encoder_outputs, encoder_states = encoder.unroll(enc_seq_len, inputs=src_embed, layout=layout) + + if args.bidirectional: + encoder_states = [mx.sym.concat(encoder_states[0][0], encoder_states[0][1]), + mx.sym.concat(encoder_states[0][1], encoder_states[1][1])] + + # This should be based on EOS or max seq len for inference, but here we unroll to the target length + # TODO: fix symbol +# outputs, _ = decoder.unroll(dec_seq_len, targ_embed, begin_state=states, layout=layout, merge_outputs=True) + outputs, _ = infer_decoder_unroll(decoder, encoder_outputs, targ_embed, targ_vocab, dec_seq_len, 0, + fc_weight, fc_bias, + attention_fc_weight, attention_fc_bias, + targ_em_weight, + begin_state=encoder_states, layout='TNC', merge_outputs=True) + + # NEW + + rs = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden), name='sym_gen_reshape1') + fc = mx.sym.FullyConnected(data=rs, weight=fc_weight, bias=fc_bias, num_hidden=len(targ_vocab), name='sym_gen_fc') + label_rs = mx.sym.Reshape(data=label, shape=(-1,), name='sym_gen_reshape2') + pred = mx.sym.SoftmaxOutput(data=fc, label=label_rs, name='sym_gen_softmax') + +# rs = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden), name='sym_gen_reshape1') +# fc = mx.sym.FullyConnected(data=rs, num_hidden=len(targ_vocab), name='sym_gen_fc') +# label_rs = mx.sym.Reshape(data=label, shape=(-1,), name='sym_gen_reshape2') +# pred = mx.sym.SoftmaxOutput(data=fc, label=label_rs, 
name='sym_gen_softmax') + + return pred, ('src_data', 'targ_data',), ('softmax_label',) + + if args.gpus: + contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] + else: + contexts = mx.cpu(0) + + model = mx.mod.BucketingModule( + sym_gen = sym_gen, + default_bucket_key = data_test.default_bucket_key, + context = contexts) + + model.bind(data_test.provide_data, data_test.provide_label, for_training=False) + + if args.load_epoch: + _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint( + [encoder, decoder], args.model_prefix, args.load_epoch) +# print(arg_params) + model.set_params(arg_params, aux_params) + + else: + arg_params = None + aux_params = None + + + opt_params = { + 'learning_rate': args.lr, + 'wd': args.wd + } + + if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']: + opt_params['momentum'] = args.mom + + opt_params['clip_gradient'] = args.max_grad_norm + + start = time() + + # mx.metric.Perplexity +# model.score(data_test, BleuScore(invalid_label), #mx.metric.Perplexity(invalid_label), +# batch_end_callback=mx.callback.Speedometer(batch_size=args.batch_size, frequent=1, auto_reset=True)) + + examples = [] + bleu_acc = 0.0 + num_inst = 0 + + try: + data_test.reset() + + smoothing_fn = nltk.translate.bleu_score.SmoothingFunction().method3 + + while True: + + data_batch = data_test.next() + model.forward(data_batch, is_train=None) + source = data_batch.data[0] + preds = model.get_outputs()[0] + labels = data_batch.label[0] + + maxed = mx.ndarray.argmax(data=preds, axis=1) + pred_nparr = maxed.asnumpy() + src_nparr = source.asnumpy() + label_nparr = labels.asnumpy().astype(np.int32) + sent_len, batch_size = np.shape(label_nparr) + pred_nparr = pred_nparr.reshape(sent_len, batch_size).astype(np.int32) + + for i in range(batch_size): + + src_lst = list(reversed(drop_sentinels(src_nparr[:, i].tolist()))) + exp_lst = drop_sentinels(label_nparr[:, i].tolist()) + act_lst = drop_sentinels(pred_nparr[:, i].tolist()) + + expected = exp_lst + 
actual = act_lst + bleu = nltk.translate.bleu_score.sentence_bleu( + references=[expected], hypothesis=actual, weights=(0.25, 0.25, 0.25, 0.25), + smoothing_function = smoothing_fn + ) + bleu_acc += bleu + num_inst += 1 + examples.append((src_lst, exp_lst, act_lst, bleu)) + + except StopIteration as se: + pass + + bleu_acc /= num_inst + + # Find the top K best translations + examples = sorted(examples, key=itemgetter(3), reverse=True) + + num_examples = 20 + + print("\nSample translations:\n") + for i in range(min(num_examples, len(examples))): + src_lst, exp_lst, act_lst, bleu = examples[i] + src_txt = array_to_text(src_lst, data_test.inv_src_vocab) + exp_txt = array_to_text(exp_lst, data_test.inv_targ_vocab) + act_txt = array_to_text(act_lst, data_test.inv_targ_vocab) + print("\n") + print("Source text: %s" % src_txt) + print("Expected translation: %s" % exp_txt) + print("Actual translation: %s" % act_txt) + print("\nTest set BLEU score (averaged over all examples): %.3f\n" % bleu_acc) + +if __name__ == '__main__': + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + args = parser.parse_args() + + if args.input_feed: + assert (args.attention == True), "--input-feed is legal only with --attention!" 
+ + # set random seeds for Python, NumPy and MxNet + import random + seed = args.seed + np.random.seed(seed) + random.seed(seed) + mx.random.seed(seed) + print("Using seed: %d" % seed) + + if args.gpus: + contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] + else: + contexts = mx.cpu(0) + + print("\n") + + if args.num_layers >= 4 and len(args.gpus.split(',')) >= 4 and not args.stack_rnn: + print('WARNING: stack-rnn is recommended to train complex model on multiple GPUs') + + if args.infer: + # Demonstrates how to load a model trained with CuDNN RNN and predict + # with non-fused MXNet symbol + infer(args) + else: + if args.inference_unrolling_for_training: + print("INFO: Using inference decoder unrolling for training") + else: + print("INFO: Using regular decoder unrolling for training") + train(args) diff --git a/seq2seq/seq2seq_iterator.py b/seq2seq_iterator.py similarity index 98% rename from seq2seq/seq2seq_iterator.py rename to seq2seq_iterator.py index 3fd096b..d851f32 100644 --- a/seq2seq/seq2seq_iterator.py +++ b/seq2seq_iterator.py @@ -102,17 +102,20 @@ def __init__( # else: # raise ValueError("Invalid layout %s: Must by NT (batch major) or TN (time major)") - def initialize(self): + def initialize(self, curr_batch_size=None): + if curr_batch_size: + self.batch_size = curr_batch_size + self.default_bucket_key = (self.default_bucket_key[0]+1, self.default_bucket_key[1]+1) if self.layout == 'TN': self.provide_data = [ mx.io.DataDesc(self.src_data_name, (self.default_bucket_key[0], self.batch_size), layout='TN'), - mx.io.DataDesc(self.targ_data_name, (self.default_bucket_key[0], self.batch_size), layout='TN') + mx.io.DataDesc(self.targ_data_name, (self.default_bucket_key[1], self.batch_size), layout='TN') ] self.provide_label = [mx.io.DataDesc(self.label_name, (self.default_bucket_key[1], self.batch_size), layout='TN')] elif self.layout == 'NT': self.provide_data = [ (self.src_data_name, (self.batch_size, self.default_bucket_key[0])), - 
(self.targ_data_name, (self.batch_size, self.default_bucket_key[0]))] + (self.targ_data_name, (self.batch_size, self.default_bucket_key[1]))] self.provide_label = [(self.label_name, (self.batch_size, self.default_bucket_key[1]))] else: raise ValueError("Invalid layout %s: Must by NT (batch major) or TN (time major)") diff --git a/speedometer_reset.patch b/speedometer_reset.patch deleted file mode 100644 index f284fd2..0000000 --- a/speedometer_reset.patch +++ /dev/null @@ -1,32 +0,0 @@ -diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py -index 396f5a1..544eab2 100644 ---- a/python/mxnet/callback.py -+++ b/python/mxnet/callback.py -@@ -96,13 +96,16 @@ class Speedometer(object): - frequent: int - How many batches between calculations. - Defaults to calculating & logging every 50 batches. -+ auto_reset : bool -+ Reset the metric after each log. - """ -- def __init__(self, batch_size, frequent=50): -+ def __init__(self, batch_size, frequent=50, auto_reset=False): - self.batch_size = batch_size - self.frequent = frequent - self.init = False - self.tic = 0 - self.last_count = 0 -+ self.auto_reset = auto_reset - - def __call__(self, param): - """Callback to Show speed.""" -@@ -116,7 +119,8 @@ class Speedometer(object): - speed = self.frequent * self.batch_size / (time.time() - self.tic) - if param.eval_metric is not None: - name_value = param.eval_metric.get_name_value() -- param.eval_metric.reset() -+ if self.auto_reset: -+ param.eval_metric.reset() - for name, value in name_value: - logging.info('Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f', - param.epoch, count, speed, name, value) diff --git a/seq2seq/split_train_valid.py b/split_train_valid.py similarity index 100% rename from seq2seq/split_train_valid.py rename to split_train_valid.py diff --git a/start_container.sh b/start_container.sh deleted file mode 100755 index 685abfc..0000000 --- a/start_container.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -nvidia-docker run --rm -it -v 
`pwd`:/mxnet_seq2seq -p 8888:8888 mxnet_seq2seq diff --git a/train_ptb/README.md b/train_ptb/README.md deleted file mode 100644 index 8a6f29d..0000000 --- a/train_ptb/README.md +++ /dev/null @@ -1,15 +0,0 @@ -RNN Example -=========== -This folder contains RNN examples using high level mxnet.rnn interface. - -Examples using low level symbol interface have been deprecated and moved to old/ - -## Data -Run `get_ptb_data.sh` to download PenTreeBank data. - -## Python - -- [lstm_bucketing.py](lstm_bucketing.py) PennTreeBank language model by using LSTM - -Performance Note: -More ```MXNET_GPU_WORKER_NTHREADS``` may lead to better performance. For setting ```MXNET_GPU_WORKER_NTHREADS```, please refer to [Environment Variables](https://mxnet.readthedocs.org/en/latest/how_to/env_var.html). diff --git a/train_ptb/cudnn_lstm_bucketing.py b/train_ptb/cudnn_lstm_bucketing.py deleted file mode 100644 index 8e0ad9d..0000000 --- a/train_ptb/cudnn_lstm_bucketing.py +++ /dev/null @@ -1,218 +0,0 @@ -import numpy as np -import mxnet as mx -import argparse - -parser = argparse.ArgumentParser(description="Train RNN on Penn Tree Bank", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--test', default=False, action='store_true', - help='whether to do testing instead of training') -parser.add_argument('--model-prefix', type=str, default=None, - help='path to save/load model') -parser.add_argument('--load-epoch', type=int, default=0, - help='load from epoch') -parser.add_argument('--num-layers', type=int, default=2, - help='number of stacked RNN layers') -parser.add_argument('--num-hidden', type=int, default=200, - help='hidden layer size') -parser.add_argument('--num-embed', type=int, default=200, - help='embedding layer size') -parser.add_argument('--bidirectional', type=bool, default=False, - help='whether to use bidirectional layers') -parser.add_argument('--gpus', type=str, - help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. 
' \ - 'Increase batch size when using multiple gpus for best performance.') -parser.add_argument('--kv-store', type=str, default='device', - help='key-value store type') -parser.add_argument('--num-epochs', type=int, default=25, - help='max num of epochs') -parser.add_argument('--lr', type=float, default=0.01, - help='initial learning rate') -parser.add_argument('--optimizer', type=str, default='sgd', - help='the optimizer type') -parser.add_argument('--mom', type=float, default=0.0, - help='momentum for sgd') -parser.add_argument('--wd', type=float, default=0.00001, - help='weight decay for sgd') -parser.add_argument('--batch-size', type=int, default=32, - help='the batch size.') -parser.add_argument('--disp-batches', type=int, default=50, - help='show progress for every n batches') -# When training a deep, complex model, it's recommended to stack fused RNN cells (one -# layer per cell) together instead of one with all layers. The reason is that fused RNN -# cells doesn't set gradients to be ready until the computation for the entire layer is -# completed. Breaking a multi-layer fused RNN cell into several one-layer ones allows -# gradients to be processed ealier. This reduces communication overhead, especially with -# multiple GPUs. 
-parser.add_argument('--stack-rnn', default=False, - help='stack fused RNN cells to reduce communication overhead') -parser.add_argument('--dropout', type=float, default='0.0', - help='dropout keep probability') - -#buckets = [32] -buckets = [10, 20, 30, 40, 50, 60] - -start_label = 1 -invalid_label = 0 - -def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0): - lines = open(fname).readlines() - lines = [filter(None, i.split(' ')) for i in lines] - sentences, vocab = mx.rnn.encode_sentences(lines, vocab=vocab, invalid_label=invalid_label, start_label=start_label) - return sentences, vocab - -def get_data(layout): - train_sent, vocab = tokenize_text("./data/ptb.train.txt", start_label=start_label, - invalid_label=invalid_label) - val_sent, _ = tokenize_text("./data/ptb.test.txt", vocab=vocab, start_label=start_label, - invalid_label=invalid_label) - - data_train = mx.rnn.BucketSentenceIter(train_sent, args.batch_size, buckets=buckets, - invalid_label=invalid_label, layout=layout) - data_val = mx.rnn.BucketSentenceIter(val_sent, args.batch_size, buckets=buckets, - invalid_label=invalid_label, layout=layout) - return data_train, data_val, vocab - - -def train(args): - data_train, data_val, vocab = get_data('TN') - if args.stack_rnn: - stack = mx.rnn.SequentialRNNCell() - for layer in range(args.num_layers): - dropout = 0.0 - if layer < (args.num_layers - 1): - dropout = args.dropout - stack.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=1, - mode='lstm', prefix='lstm_%d'%layer, dropout=dropout, - bidirectional=args.bidirectional)) - cell = stack - else: - cell = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout, - mode='lstm', bidirectional=args.bidirectional) - - def sym_gen(seq_len): - data = mx.sym.Variable('data') - label = mx.sym.Variable('softmax_label') - embed = mx.sym.Embedding(data=data, input_dim=len(vocab), output_dim=args.num_embed,name='embed') - - output, _ = cell.unroll(seq_len, inputs=embed, 
merge_outputs=True, layout='TNC') - - pred = mx.sym.Reshape(output, - shape=(-1, args.num_hidden*(1+args.bidirectional))) - pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred') - - label = mx.sym.Reshape(label, shape=(-1,)) - pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') - - return pred, ('data',), ('softmax_label',) - - if args.gpus: - contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] - else: - contexts = mx.cpu(0) - - model = mx.mod.BucketingModule( - sym_gen = sym_gen, - default_bucket_key = data_train.default_bucket_key, - context = contexts) - - if args.load_epoch: - _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint( - cell, args.model_prefix, args.load_epoch) - else: - arg_params = None - aux_params = None - - opt_params = { - 'learning_rate': args.lr, - 'wd': args.wd - } - - if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']: - opt_params['momentum'] = args.mom - - model.fit( - train_data = data_train, - eval_data = data_val, - eval_metric = mx.metric.Perplexity(invalid_label), - kvstore = args.kv_store, - optimizer = args.optimizer, - optimizer_params = opt_params, - initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), - arg_params = arg_params, - aux_params = aux_params, - begin_epoch = args.load_epoch, - num_epoch = args.num_epochs, - batch_end_callback = mx.callback.Speedometer(args.batch_size, args.disp_batches), - epoch_end_callback = mx.rnn.do_rnn_checkpoint(cell, args.model_prefix, 1) - if args.model_prefix else None) - -def test(args): - assert args.model_prefix, "Must specifiy path to load from" - _, data_val, vocab = get_data('NT') - - if not args.stack_rnn: - stack = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, - mode='lstm', bidirectional=args.bidirectional).unfuse() - else: - stack = mx.rnn.SequentialRNNCell() - for i in range(args.num_layers): - cell = mx.rnn.LSTMCell(num_hidden=args.num_hidden, prefix='lstm_%dl0_'%i) - if 
args.bidirectional: - cell = mx.rnn.BidirectionalCell( - cell, - mx.rnn.LSTMCell(num_hidden=args.num_hidden, prefix='lstm_%dr0_'%i), - output_prefix='bi_lstm_%d'%i) - stack.add(cell) - - def sym_gen(seq_len): - data = mx.sym.Variable('data') - label = mx.sym.Variable('softmax_label') - embed = mx.sym.Embedding(data=data, input_dim=len(vocab), - output_dim=args.num_embed, name='embed') - - stack.reset() - outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True) - - pred = mx.sym.Reshape(outputs, - shape=(-1, args.num_hidden*(1+args.bidirectional))) - pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred') - - label = mx.sym.Reshape(label, shape=(-1,)) - pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') - - return pred, ('data',), ('softmax_label',) - - if args.gpus: - contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] - else: - contexts = mx.cpu(0) - - model = mx.mod.BucketingModule( - sym_gen = sym_gen, - default_bucket_key = data_val.default_bucket_key, - context = contexts) - model.bind(data_val.provide_data, data_val.provide_label, for_training=False) - - # note here we load using SequentialRNNCell instead of FusedRNNCell. 
- _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(stack, args.model_prefix, args.load_epoch) - model.set_params(arg_params, aux_params) - - model.score(data_val, mx.metric.Perplexity(invalid_label), - batch_end_callback=mx.callback.Speedometer(args.batch_size, 5)) - -if __name__ == '__main__': - import logging - head = '%(asctime)-15s %(message)s' - logging.basicConfig(level=logging.DEBUG, format=head) - - args = parser.parse_args() - - if args.num_layers >= 4 and len(args.gpus.split(',')) >= 4 and not args.stack_rnn: - print('WARNING: stack-rnn is recommended to train complex model on multiple GPUs') - - if args.test: - # Demonstrates how to load a model trained with CuDNN RNN and predict - # with non-fused MXNet symbol - test(args) - else: - train(args) diff --git a/train_ptb/get_ptb_data.sh b/train_ptb/get_ptb_data.sh deleted file mode 100755 index 1ec009a..0000000 --- a/train_ptb/get_ptb_data.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -RNN_DIR=$(cd `dirname $0`; pwd) -DATA_DIR="${RNN_DIR}/data/" - -if [[ ! 
-d "${DATA_DIR}" ]]; then - echo "${DATA_DIR} doesn't exist, will create one"; - mkdir -p ${DATA_DIR} -fi - -wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.train.txt; -wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.valid.txt; -wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.test.txt; -wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt; diff --git a/train_ptb/lstm_bucketing.py b/train_ptb/lstm_bucketing.py deleted file mode 100644 index 4bc934a..0000000 --- a/train_ptb/lstm_bucketing.py +++ /dev/null @@ -1,107 +0,0 @@ -import numpy as np -import mxnet as mx -import argparse - -parser = argparse.ArgumentParser(description="Train RNN on Penn Tree Bank", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--num-layers', type=int, default=2, - help='number of stacked RNN layers') -parser.add_argument('--num-hidden', type=int, default=200, - help='hidden layer size') -parser.add_argument('--num-embed', type=int, default=200, - help='embedding layer size') -parser.add_argument('--gpus', type=str, - help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. 
' \ - 'Increase batch size when using multiple gpus for best performance.') -parser.add_argument('--kv-store', type=str, default='device', - help='key-value store type') -parser.add_argument('--num-epochs', type=int, default=25, - help='max num of epochs') -parser.add_argument('--lr', type=float, default=0.01, - help='initial learning rate') -parser.add_argument('--optimizer', type=str, default='sgd', - help='the optimizer type') -parser.add_argument('--mom', type=float, default=0.0, - help='momentum for sgd') -parser.add_argument('--wd', type=float, default=0.00001, - help='weight decay for sgd') -parser.add_argument('--batch-size', type=int, default=32, - help='the batch size.') -parser.add_argument('--disp-batches', type=int, default=50, - help='show progress for every n batches') - - -def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0): - lines = open(fname).readlines() - lines = [filter(None, i.split(' ')) for i in lines] - sentences, vocab = mx.rnn.encode_sentences(lines, vocab=vocab, invalid_label=invalid_label, - start_label=start_label) - return sentences, vocab - - -if __name__ == '__main__': - import logging - head = '%(asctime)-15s %(message)s' - logging.basicConfig(level=logging.DEBUG, format=head) - - args = parser.parse_args() - - #buckets = [] - buckets = [10, 20, 30, 40, 50, 60] - - start_label = 1 - invalid_label = 0 - - train_sent, vocab = tokenize_text("./data/ptb.train.txt", start_label=start_label, - invalid_label=invalid_label) - val_sent, _ = tokenize_text("./data/ptb.test.txt", vocab=vocab, start_label=start_label, - invalid_label=invalid_label) - - data_train = mx.rnn.BucketSentenceIter(train_sent, args.batch_size, buckets=buckets, - invalid_label=invalid_label) - data_val = mx.rnn.BucketSentenceIter(val_sent, args.batch_size, buckets=buckets, - invalid_label=invalid_label) - - stack = mx.rnn.SequentialRNNCell() - for i in range(args.num_layers): - stack.add(mx.rnn.LSTMCell(num_hidden=args.num_hidden, 
prefix='lstm_l%d_'%i)) - - def sym_gen(seq_len): - data = mx.sym.Variable('data') - label = mx.sym.Variable('softmax_label') - embed = mx.sym.Embedding(data=data, input_dim=len(vocab), - output_dim=args.num_embed, name='embed') - - stack.reset() - outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True) - - pred = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden)) - pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred') - - label = mx.sym.Reshape(label, shape=(-1,)) - pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') - - return pred, ('data',), ('softmax_label',) - - if args.gpus: - contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] - else: - contexts = mx.cpu(0) - - model = mx.mod.BucketingModule( - sym_gen = sym_gen, - default_bucket_key = data_train.default_bucket_key, - context = contexts) - - model.fit( - train_data = data_train, - eval_data = data_val, - eval_metric = mx.metric.Perplexity(invalid_label), - kvstore = args.kv_store, - optimizer = args.optimizer, - optimizer_params = { 'learning_rate': args.lr, - 'momentum': args.mom, - 'wd': args.wd }, - initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), - num_epoch = args.num_epochs, - batch_end_callback = mx.callback.Speedometer(args.batch_size, args.disp_batches)) diff --git a/seq2seq/utils.py b/utils.py similarity index 83% rename from seq2seq/utils.py rename to utils.py index 237547d..28be52f 100644 --- a/seq2seq/utils.py +++ b/utils.py @@ -15,8 +15,8 @@ Dataset = namedtuple( 'Dataset', - ['src_train_sent', 'src_valid_sent', 'src_vocab', 'inv_src_vocab', - 'targ_train_sent', 'targ_valid_sent', 'targ_vocab', 'inv_targ_vocab']) + ['src_train_sent', 'src_valid_sent', 'src_test_sent', 'src_vocab', 'inv_src_vocab', + 'targ_train_sent', 'targ_valid_sent', 'targ_test_sent', 'targ_vocab', 'inv_targ_vocab']) def invert_dict(d): return {v: k for k, v in d.iteritems()} @@ -91,7 +91,8 @@ def array_to_text(array, inv_vocab): 
sent.append(inv_vocab[token]) return " ".join(sent) -def get_s2s_data(src_train_path, src_valid_path, targ_train_path, targ_valid_path, +def get_s2s_data(src_train_path, src_valid_path, src_test_path, targ_train_path, + targ_valid_path, targ_test_path, reserved_tokens=['', '', '', '']): print("Creating joint source dictionary") @@ -99,8 +100,10 @@ def get_s2s_data(src_train_path, src_valid_path, targ_train_path, targ_valid_pat print("Tokenizing src_train_path") src_train_sent = tokenize_text(src_train_path, vocab=src_dict) - print("Tokenizing targ_train_path") + print("Tokenizing src_valid_path") src_valid_sent = tokenize_text(src_valid_path, vocab=src_dict) + print("Tokenizing src_test_path") + src_test_sent = tokenize_text(src_test_path, vocab=src_dict) print("Creating joint target dictionary") targ_dict, inv_targ_dict = top_words_train_valid(targ_train_path, targ_valid_path) @@ -109,6 +112,8 @@ def get_s2s_data(src_train_path, src_valid_path, targ_train_path, targ_valid_pat targ_train_sent = tokenize_text(targ_train_path, vocab=targ_dict) print("Tokenizing targ_valid_path") targ_valid_sent = tokenize_text(targ_valid_path, vocab=targ_dict) + print("Tokenizing targ_test_path") + targ_test_sent = tokenize_text(targ_test_path, vocab=targ_dict) print("\nEncoded source language sentences:\n") for i in range(5): @@ -120,6 +125,7 @@ def get_s2s_data(src_train_path, src_valid_path, targ_train_path, targ_valid_pat return Dataset( - src_train_sent=src_train_sent, src_valid_sent=src_valid_sent, src_vocab=src_dict, inv_src_vocab=inv_src_dict, - targ_train_sent=targ_train_sent, targ_valid_sent=targ_valid_sent, targ_vocab=targ_dict, inv_targ_vocab=inv_targ_dict) + src_train_sent=src_train_sent, src_valid_sent=src_valid_sent, src_test_sent=src_test_sent, + src_vocab=src_dict, inv_src_vocab=inv_src_dict, targ_train_sent=targ_train_sent, + targ_valid_sent=targ_valid_sent, targ_test_sent=targ_test_sent, targ_vocab=targ_dict, inv_targ_vocab=inv_targ_dict)