Jeff Rasley
2021-11-29 15:41:18 -08:00
committed by GitHub
parent c3f1d82b07
commit a10e4811fe
88 changed files with 21928 additions and 21927 deletions

@@ -1,155 +1,155 @@
---
# Refer to the following link for an explanation of each parameter:
# http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
# This is deprecated
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
# Disable the splits below, otherwise they just add to the vertical length of source files!
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: WebKit
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
# Keep the two values below equal to `IndentWidth` so everything stays uniform
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 4
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
# Enabling comment reflow can mangle the formatting of Doxygen comments!
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
# Keep TabWidth consistent with IndentWidth, even for people who use tabs for indentation!
TabWidth: 4
UseTab: Never
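
For orientation, the sketch below (hand-formatted, illustrative only; the function and its names are hypothetical and not part of this commit) shows roughly how C++ code comes out under the key settings above. Exact clang-format output may differ in corner cases.

#include <cstddef>
#include <vector>

// Hypothetical helper, shown only to illustrate the options above; not part of DeepSpeed.
// BinPackParameters: false gives one parameter per line once the 100-column limit is exceeded,
// and AlignAfterOpenBracket: Align lines them up under the opening parenthesis.
static float weighted_sum(const std::vector<float>& values,
                          const std::vector<float>& weights,  // assumed same length as values
                          float bias,
                          bool normalize)
{
    // IndentWidth: 4; WebKit brace breaking keeps control-statement braces attached, and the
    // AllowShort*OnASingleLine options permit one-line bodies within the column limit.
    float total = bias;  // SpacesBeforeTrailingComments: 2
    for (std::size_t i = 0; i < values.size(); ++i) { total += values[i] * weights[i]; }
    if (normalize && !values.empty()) { total /= static_cast<float>(values.size()); }
    return total;
}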

@@ -1,35 +1,35 @@
name: Formatting
on:
push:
branches:
- 'master'
- 'staging**'
pull_request:
branches:
'**'
jobs:
# formatting checks and a basic install on a CPU-only machine
formatting:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: environment
run: |
which python
python --version
pip install torch==1.9.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
python -c "import torch; print('torch:', torch.__version__, torch)"
- name: Install deepspeed
run: |
pip install .[dev,autotuning]
ds_report
- name: Formatting checks
run: |
pre-commit run --all-files

@@ -4,13 +4,15 @@ repos:
rev: v1.2.3
hooks:
- id: trailing-whitespace
exclude: "examples/"
exclude: "DeepSpeedExamples/"
- id: check-yaml
exclude: "examples/"
exclude: "DeepSpeedExamples/"
- id: end-of-file-fixer
exclude: "examples/"
exclude: "DeepSpeedExamples/"
exclude: "docs/CNAME"
- id: mixed-line-ending
exclude: "DeepSpeedExamples/"
args: [--fix=lf]
- repo: https://github.com/pre-commit/mirrors-yapf
rev: v0.29.0

@@ -1,9 +1,9 @@
# Microsoft Open Source Code of Conduct
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
Resources:
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns

LICENSE
@@ -1,21 +1,21 @@
MIT License
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -1,41 +1,41 @@
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.3 BLOCK -->
## Security
Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
## Reporting Security Issues
**Please do not report security vulnerabilities through public GitHub issues.**
Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue
This information will help us triage your report more quickly.
If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
## Preferred Languages
We prefer all communications to be in English.
## Policy
Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
<!-- END MICROSOFT SECURITY.MD BLOCK -->

@@ -1,227 +1,227 @@
#include "cpu_adagrad.h"
#include <cuda_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "custom_cuda_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
void Adagrad_Optimizer::Step_1(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<1>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size) {
float step_size = -1 * _alpha;
__half* grads_cast_h;
__half* params_cast_h;
if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params);
}
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
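// Two host-side staging buffers are cycled for the async copies to dev_params; once both have
// been issued (t / TILE >= 2), wait for the earlier copy on this buffer's stream before reuse.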
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
float param = half_precision ? (float)params_cast_h[k] : _params[k];
float momentum = grads[k];
float variance = _exp_avg_sq[k];
if (_weight_decay > 0) { grad = param * _weight_decay + grad; }
variance += grad * grad;
grad = sqrt(variance);
grad += _eps;
grad = momentum / grad;
param = grad * step_size + param;
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
if (half_precision)
params_cast_h[k] = (__half)param;
else
_params[k] = param;
// STORE UPDATE TERM TO GRAD'S MEMORY
grads[k] = grad * step_size;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
}
}
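// For reference (summary of the scalar loop above, not in the source): writing g = grads[k],
// G = _exp_avg_sq[k], theta = _params[k], lambda = _weight_decay, alpha = _alpha, each element gets
//     g_wd  = g + lambda * theta                  (only if lambda > 0)
//     G     = G + g_wd^2
//     theta = theta - alpha * g / (sqrt(G) + _eps)
// Note that G accumulates the weight-decayed gradient while the step's numerator uses the raw
// gradient (the `momentum` local); the scaled update term is also written back into grads[k].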
void Adagrad_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adagrad_optimizer(int optimizer_id,
float alpha = 1e-2,
float eps = 1e-8,
float weight_decay = 0,
bool should_log = false)
{
auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
}
return 0;
}
void Adagrad_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adagrad_step(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
opt->SynchronizeStreams();
return 0;
}
int ds_adagrad_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int destroy_adagrad_optimizer(int optimizer_id)
{
s_optimizers.erase(optimizer_id);
return 0;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
m.def("adagrad_update_copy",
&ds_adagrad_step_plus_copy,
"DeepSpeed CPU Adagrad update and param copy (C++)");
m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
}
#include "cpu_adagrad.h"
#include <cuda_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "custom_cuda_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
void Adagrad_Optimizer::Step_1(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<1>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size) {
float step_size = -1 * _alpha;
__half* grads_cast_h;
__half* params_cast_h;
if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params);
}
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
float param = half_precision ? (float)params_cast_h[k] : _params[k];
float momentum = grads[k];
float variance = _exp_avg_sq[k];
if (_weight_decay > 0) { grad = param * _weight_decay + grad; }
variance += grad * grad;
grad = sqrt(variance);
grad += _eps;
grad = momentum / grad;
param = grad * step_size + param;
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
if (half_precision)
params_cast_h[k] = (__half)param;
else
_params[k] = param;
// STORE UPDATE TERM TO GRAD'S MEMORY
grads[k] = grad * step_size;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
}
}
void Adagrad_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adagrad_optimizer(int optimizer_id,
float alpha = 1e-2,
float eps = 1e-8,
float weight_decay = 0,
bool should_log = false)
{
auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
}
return 0;
}
void Adagrad_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adagrad_step(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
opt->SynchronizeStreams();
return 0;
}
int ds_adagrad_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int destroy_adagrad_optimizer(int optimizer_id)
{
s_optimizers.erase(optimizer_id);
return 0;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
m.def("adagrad_update_copy",
&ds_adagrad_step_plus_copy,
"DeepSpeed CPU Adagrad update and param copy (C++)");
m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
}

@@ -1,292 +1,292 @@
#include "cpu_adam.h"
#include <cuda_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "custom_cuda_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
void Adam_Optimizer::Step_1(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<1>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size) {
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
float step_size = -1 * _alpha / _bias_correction1;
float w_decay = -1 * _alpha * _weight_decay;
__half* grads_cast_h;
__half* params_cast_h;
if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params);
}
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
float param = half_precision ? (float)params_cast_h[k] : _params[k];
float momentum = _exp_avg[k];
float variance = _exp_avg_sq[k];
if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
momentum = momentum * _betta1;
momentum = grad * betta1_minus1 + momentum;
variance = variance * _betta2;
grad = grad * grad;
variance = grad * betta2_minus1 + variance;
grad = sqrt(variance);
grad = grad * _bias_correction2 + _eps;
grad = momentum / grad;
if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
param = grad * step_size + param;
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
if (half_precision)
params_cast_h[k] = (__half)param;
else
_params[k] = param;
_exp_avg[k] = momentum;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
}
}
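// For reference (summary of the scalar loop above, not in the source): with m = _exp_avg[k],
// v = _exp_avg_sq[k], theta = _params[k], g = grads[k], lambda = _weight_decay, and the code's
// step_size = -_alpha / _bias_correction1 and w_decay = -_alpha * _weight_decay, each element gets
//     g     = g + lambda * theta                       (only if lambda > 0 and !_adamw_mode)
//     m     = betta1 * m + (1 - betta1) * g
//     v     = betta2 * v + (1 - betta2) * g^2
//     theta = theta + w_decay * theta                   (only if lambda > 0 and _adamw_mode)
//     theta = theta + step_size * m / (sqrt(v) * _bias_correction2 + _eps)
// Assuming _bias_correction1 = 1 - betta1^t and _bias_correction2 = 1 / sqrt(1 - betta2^t)
// (set in IncrementStep, which lives in cpu_adam.h and is not shown in this diff), this is the
// standard Adam / AdamW update.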
void Adam_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adam_optimizer(int optimizer_id,
float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true,
bool should_log = false)
{
auto opt =
std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
alpha,
betta1,
betta2,
weight_decay,
(int)adamw_mode);
}
return 0;
}
void Adam_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adam_step(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
// assert(params.options().dtype() == grads.options().dtype());
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
nullptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int ds_adam_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int destroy_adam_optimizer(int optimizer_id)
{
s_optimizers.erase(optimizer_id);
return 0;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
m.def("adam_update_copy",
&ds_adam_step_plus_copy,
"DeepSpeed CPU Adam update and param copy (C++)");
m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
}
#include "cpu_adam.h"
#include <cuda_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "custom_cuda_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
void Adam_Optimizer::Step_1(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<1>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size) {
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
float step_size = -1 * _alpha / _bias_correction1;
float w_decay = -1 * _alpha * _weight_decay;
__half* grads_cast_h;
__half* params_cast_h;
if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params);
}
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
float param = half_precision ? (float)params_cast_h[k] : _params[k];
float momentum = _exp_avg[k];
float variance = _exp_avg_sq[k];
if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
momentum = momentum * _betta1;
momentum = grad * betta1_minus1 + momentum;
variance = variance * _betta2;
grad = grad * grad;
variance = grad * betta2_minus1 + variance;
grad = sqrt(variance);
grad = grad * _bias_correction2 + _eps;
grad = momentum / grad;
if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
param = grad * step_size + param;
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
if (half_precision)
params_cast_h[k] = (__half)param;
else
_params[k] = param;
_exp_avg[k] = momentum;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
}
}
void Adam_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adam_optimizer(int optimizer_id,
float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true,
bool should_log = false)
{
auto opt =
std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
alpha,
betta1,
betta2,
weight_decay,
(int)adamw_mode);
}
return 0;
}
void Adam_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adam_step(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
// assert(params.options().dtype() == grads.options().dtype());
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
nullptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int ds_adam_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int destroy_adam_optimizer(int optimizer_id)
{
s_optimizers.erase(optimizer_id);
return 0;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
m.def("adam_update_copy",
&ds_adam_step_plus_copy,
"DeepSpeed CPU Adam update and param copy (C++)");
m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
}

@@ -1,333 +1,333 @@
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <libaio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <numeric>
#include <string>
#include <vector>
#include "deepspeed_aio_common.h"
using namespace std;
using namespace std::chrono;
#define DEBUG_DS_AIO_PERF 0
#define DEBUG_DS_AIO_SUBMIT_PERF 0
static const std::string c_library_name = "deepspeed_aio";
static void _report_aio_statistics(const char* tag,
const std::vector<std::chrono::duration<double>>& latencies)
__attribute__((unused));
static void _report_aio_statistics(const char* tag,
const std::vector<std::chrono::duration<double>>& latencies)
{
std::vector<double> lat_usec;
for (auto& lat : latencies) { lat_usec.push_back(lat.count() * 1e6); }
const auto min_lat = *(std::min_element(lat_usec.begin(), lat_usec.end()));
const auto max_lat = *(std::max_element(lat_usec.begin(), lat_usec.end()));
const auto avg_lat = std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size();
std::cout << c_library_name << ": latency statistics(usec) " << tag
<< " min/max/avg = " << min_lat << " " << max_lat << " " << avg_lat << std::endl;
}
static void _get_aio_latencies(std::vector<std::chrono::duration<double>>& raw_latencies,
struct deepspeed_aio_latency_t& summary_latencies)
{
std::vector<double> lat_usec;
for (auto& lat : raw_latencies) { lat_usec.push_back(lat.count() * 1e6); }
summary_latencies._min_usec = *(std::min_element(lat_usec.begin(), lat_usec.end()));
summary_latencies._max_usec = *(std::max_element(lat_usec.begin(), lat_usec.end()));
summary_latencies._avg_usec =
std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size();
}
static void _do_io_submit_singles(const long long int n_iocbs,
const long long int iocb_index,
std::unique_ptr<aio_context>& aio_ctxt,
std::vector<std::chrono::duration<double>>& submit_times)
{
for (auto i = 0; i < n_iocbs; ++i) {
const auto st = std::chrono::high_resolution_clock::now();
const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, 1, aio_ctxt->_iocbs.data() + i);
submit_times.push_back(std::chrono::high_resolution_clock::now() - st);
#if DEBUG_DS_AIO_SUBMIT_PERF
printf("submit(usec) %f io_index=%lld buf=%p len=%lu off=%llu \n",
submit_times.back().count() * 1e6,
iocb_index,
aio_ctxt->_iocbs[i]->u.c.buf,
aio_ctxt->_iocbs[i]->u.c.nbytes,
aio_ctxt->_iocbs[i]->u.c.offset);
#endif
assert(submit_ret > 0);
}
}
static void _do_io_submit_block(const long long int n_iocbs,
const long long int iocb_index,
std::unique_ptr<aio_context>& aio_ctxt,
std::vector<std::chrono::duration<double>>& submit_times)
{
const auto st = std::chrono::high_resolution_clock::now();
const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, n_iocbs, aio_ctxt->_iocbs.data());
submit_times.push_back(std::chrono::high_resolution_clock::now() - st);
#if DEBUG_DS_AIO_SUBMIT_PERF
printf("submit(usec) %f io_index=%lld nr=%lld buf=%p len=%lu off=%llu \n",
submit_times.back().count() * 1e6,
iocb_index,
n_iocbs,
aio_ctxt->_iocbs[0]->u.c.buf,
aio_ctxt->_iocbs[0]->u.c.nbytes,
aio_ctxt->_iocbs[0]->u.c.offset);
#endif
assert(submit_ret > 0);
}
static int _do_io_complete(const long long int min_completes,
const long long int max_completes,
std::unique_ptr<aio_context>& aio_ctxt,
std::vector<std::chrono::duration<double>>& reap_times)
{
const auto start_time = std::chrono::high_resolution_clock::now();
const auto n_completes = io_getevents(
aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr);
reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time);
assert(n_completes >= min_completes);
return n_completes;
}
void do_aio_operation_sequential(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf)
{
struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs);
const auto num_io_blocks = static_cast<long long int>(
ceil(static_cast<double>(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size));
#if DEBUG_DS_AIO_PERF
const auto io_op_name = std::string(read_op ? "read" : "write");
std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes
<< " bytes with " << num_io_blocks << " io blocks" << std::endl;
#endif
std::vector<std::chrono::duration<double>> submit_times;
std::vector<std::chrono::duration<double>> reap_times;
const auto max_queue_bytes =
static_cast<long long int>(aio_ctxt->_queue_depth * aio_ctxt->_block_size);
auto start = std::chrono::high_resolution_clock::now();
for (long long iocb_index = 0; iocb_index < num_io_blocks;
iocb_index += aio_ctxt->_queue_depth) {
const auto start_offset = iocb_index * aio_ctxt->_block_size;
const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset;
const auto n_iocbs =
min(static_cast<long long>(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index));
const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset));
prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset);
if (config->_single_submit) {
_do_io_submit_singles(n_iocbs, iocb_index, aio_ctxt, submit_times);
} else {
_do_io_submit_block(n_iocbs, iocb_index, aio_ctxt, submit_times);
}
_do_io_complete(n_iocbs, n_iocbs, aio_ctxt, reap_times);
}
const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
if (perf) {
_get_aio_latencies(submit_times, perf->_submit);
_get_aio_latencies(reap_times, perf->_complete);
perf->_e2e_usec = elapsed.count() * 1e6;
perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9);
}
#if DEBUG_DS_AIO_PERF
_report_aio_statistics("submit", submit_times);
_report_aio_statistics("complete", reap_times);
#endif
#if DEBUG_DS_AIO_PERF
std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6
<< " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl;
#endif
#if DEBUG_DS_AIO_PERF
std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes
<< " bytes " << std::endl;
#endif
}
void do_aio_operation_overlap(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf)
{
struct io_prep_generator io_gen(read_op, xfer_ctxt, aio_ctxt->_block_size);
#if DEBUG_DS_AIO_PERF
const auto io_op_name = std::string(read_op ? "read" : "write");
std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes
<< " bytes with " << io_gen._num_io_blocks << " io blocks" << std::endl;
#endif
std::vector<std::chrono::duration<double>> submit_times;
std::vector<std::chrono::duration<double>> reap_times;
auto request_iocbs = aio_ctxt->_queue_depth;
auto n_pending_iocbs = 0;
const auto min_completes = 1;
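// Overlapped mode: keep up to _queue_depth iocbs in flight; each pass tops the queue back up and
// then reaps at least one completion (min_completes = 1) so submission and completion overlap.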
auto start = std::chrono::high_resolution_clock::now();
while (true) {
const auto n_iocbs = io_gen.prep_iocbs(request_iocbs - n_pending_iocbs, &aio_ctxt->_iocbs);
if (n_iocbs > 0) {
if (config->_single_submit) {
_do_io_submit_singles(
n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times);
} else {
_do_io_submit_block(
n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times);
}
}
n_pending_iocbs += n_iocbs;
assert(n_pending_iocbs <= aio_ctxt->_queue_depth);
if (n_pending_iocbs == 0) { break; }
const auto n_complete =
_do_io_complete(min_completes, n_pending_iocbs, aio_ctxt, reap_times);
n_pending_iocbs -= n_complete;
}
const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
if (perf) {
_get_aio_latencies(submit_times, perf->_submit);
_get_aio_latencies(reap_times, perf->_complete);
perf->_e2e_usec = elapsed.count() * 1e6;
perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9);
}
#if DEBUG_DS_AIO_PERF
_report_aio_statistics("submit", submit_times);
_report_aio_statistics("complete", reap_times);
#endif
#if DEBUG_DS_AIO_PERF
std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6
<< " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl;
#endif
#if DEBUG_DS_AIO_PERF
std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes
<< " bytes " << std::endl;
#endif
}
void report_file_error(const char* filename, const std::string file_op, const int error_code)
{
std::string err_msg = file_op + std::string(" failed on ") + std::string(filename) +
" error = " + std::to_string(error_code);
std::cerr << c_library_name << ": " << err_msg << std::endl;
}
int open_file(const char* filename, const bool read_op)
{
const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT);
const int mode = 0600;
const auto fd = open(filename, flags, mode);
if (fd == -1) {
const auto error_code = errno;
const auto error_msg = read_op ? " open for read " : " open for write ";
report_file_error(filename, error_msg, error_code);
return -1;
}
return fd;
}
int regular_read(const char* filename, std::vector<char>& buffer)
{
long long int num_bytes;
const auto f_size = get_file_size(filename, num_bytes);
assert(f_size != -1);
buffer.resize(num_bytes);
const auto fd = open(filename, O_RDONLY, 0600);
assert(fd != -1);
long long int read_bytes = 0;
auto r = 0;
do {
const auto buffer_ptr = buffer.data() + read_bytes;
const auto bytes_to_read = num_bytes - read_bytes;
r = read(fd, buffer_ptr, bytes_to_read);
read_bytes += r;
} while (r > 0);
if (read_bytes != num_bytes) {
std::cerr << "read error "
<< " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes
<< std::endl;
}
assert(read_bytes == num_bytes);
close(fd);
return 0;
}
static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes)
{
std::vector<char> regular_buffer;
const auto reg_ret = regular_read(filename, regular_buffer);
assert(0 == reg_ret);
std::cout << "regular read of " << filename << " returned " << regular_buffer.size() << " bytes"
<< std::endl;
if (static_cast<long long int>(regular_buffer.size()) != num_bytes) { return false; }
return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size()));
}
bool validate_aio_operation(const bool read_op,
const char* filename,
void* aio_buffer,
const long long int num_bytes)
{
const auto msg_suffix = std::string("deepspeed_aio_") +
std::string(read_op ? "read()" : "write()") +
std::string("using read()");
if (false == _validate_buffer(filename, aio_buffer, num_bytes)) {
std::cout << "Fail: correctness of " << msg_suffix << std::endl;
return false;
}
std::cout << "Pass: correctness of " << msg_suffix << std::endl;
return true;
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#pragma once
#include <deepspeed_aio_utils.h>
#include <stdlib.h>
#include <memory>
#include <string>
#include <vector>
using namespace std;
void do_aio_operation_sequential(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf);
void do_aio_operation_overlap(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf);
int open_file(const char* filename, const bool read_op);
void report_file_error(const char* filename, const std::string file_op, const int error_code);
int regular_read(const char* filename, std::vector<char>& buffer);
bool validate_aio_operation(const bool read_op,
const char* filename,
void* aio_buffer,
const long long int num_bytes);
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <cmath>
#include "deepspeed_aio_utils.h"
using namespace std;
const int c_block_size = 128 * 1024;
const int c_io_queue_depth = 8;
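// Defaults used by the zero-argument config constructor: 128KB transfer blocks and a libaio queue
// depth of 8.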
deepspeed_aio_config_t::deepspeed_aio_config_t()
: _block_size(c_block_size),
_queue_depth(c_io_queue_depth),
_single_submit(false),
_overlap_events(false),
_lock_memory(false)
{
}
deepspeed_aio_config_t::deepspeed_aio_config_t(const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool lock_memory)
: _block_size(block_size),
_queue_depth(queue_depth),
_single_submit(single_submit),
_overlap_events(overlap_events),
_lock_memory(lock_memory)
{
}
void deepspeed_aio_latency_t::dump(const std::string tag)
{
std::cout << tag << _min_usec << " " << _max_usec << " " << _avg_usec << " " << std::endl;
}
void deepspeed_aio_latency_t::accumulate(const struct deepspeed_aio_latency_t& other)
{
_min_usec += other._min_usec;
_max_usec += other._max_usec;
_avg_usec += other._avg_usec;
}
void deepspeed_aio_latency_t::scale(const float scaler)
{
_min_usec *= scaler;
_max_usec *= scaler;
_avg_usec *= scaler;
}
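// aio_context owns the libaio io_context_t together with iocb and io_event arrays pre-sized to the
// queue depth; the destructor releases all of them.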
aio_context::aio_context(const int block_size, const int queue_depth)
{
_block_size = block_size;
_queue_depth = queue_depth;
for (auto i = 0; i < queue_depth; ++i) {
_iocbs.push_back((struct iocb*)calloc(1, sizeof(struct iocb)));
}
_io_events.resize(queue_depth);
io_queue_init(queue_depth, &_io_ctxt);
}
aio_context::~aio_context()
{
for (auto& iocb : _iocbs) { free(iocb); }
_io_events.resize(0);
io_queue_release(_io_ctxt);
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#pragma once
#include <libaio.h>
#include <stdlib.h>
#include <string>
#include <vector>
using namespace std;
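// Min/max/average latency in microseconds for one phase (submission or completion).
// accumulate() adds another sample set and scale() rescales all three fields, e.g. to average
// across iterations.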
struct deepspeed_aio_latency_t {
double _min_usec;
double _max_usec;
double _avg_usec;
void dump(const std::string tag);
void accumulate(const deepspeed_aio_latency_t&);
void scale(const float value);
};
struct deepspeed_aio_perf_t {
deepspeed_aio_latency_t _submit;
deepspeed_aio_latency_t _complete;
double _e2e_usec;
double _e2e_rate_GB;
};
struct deepspeed_aio_config_t {
const int _block_size;
const int _queue_depth;
const bool _single_submit;
const bool _overlap_events;
const bool _lock_memory;
deepspeed_aio_config_t();
deepspeed_aio_config_t(const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool lock_memory);
};
struct aio_context {
io_context_t _io_ctxt;
std::vector<struct io_event> _io_events;
std::vector<struct iocb*> _iocbs;
int _block_size;
int _queue_depth;
aio_context(const int block_size, const int queue_depth);
~aio_context();
};
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <cmath>
#include "deepspeed_aio_utils.h"
using namespace std;
const int c_block_size = 128 * 1024;
const int c_io_queue_depth = 8;
io_xfer_ctxt::io_xfer_ctxt(const int fd,
const long long int file_offset,
const long long int num_bytes,
const void* buffer)
: _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes)
{
}
io_prep_context::io_prep_context(const bool read_op,
const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
const size_t block_size,
const std::vector<struct iocb*>* iocbs)
: _read_op(read_op), _xfer_ctxt(xfer_ctxt), _block_size(block_size), _iocbs(iocbs)
{
}
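// Fills the first n_iocbs entries of the iocb array with pread/pwrite requests covering num_bytes
// starting at start_offset; the final block is shortened when num_bytes is not a multiple of
// _block_size.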
void io_prep_context::prep_iocbs(const int n_iocbs,
const size_t num_bytes,
const void* start_buffer,
const long long int start_offset)
{
assert(static_cast<size_t>(n_iocbs) <= _iocbs->size());
for (auto i = 0; i < n_iocbs; ++i) {
const auto shift = i * _block_size;
const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift;
const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift;
auto byte_count = _block_size;
if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; }
if (_read_op) {
io_prep_pread(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset);
} else {
io_prep_pwrite(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset);
}
}
}
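// Incremental iocb producer used by the overlapped path: it tracks the remaining bytes and blocks
// of a transfer and, on each prep_iocbs() call, prepares up to n_iocbs further requests, returning
// how many were actually prepared (0 once the transfer is exhausted).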
io_prep_generator::io_prep_generator(const bool read_op,
const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
const size_t block_size)
: _read_op(read_op),
_xfer_ctxt(xfer_ctxt),
_block_size(block_size),
_remaining_bytes(xfer_ctxt->_num_bytes),
_next_iocb_index(0)
{
_num_io_blocks =
static_cast<long long int>(ceil(static_cast<double>(xfer_ctxt->_num_bytes) / block_size));
_remaining_io_blocks = _num_io_blocks;
}
int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector<struct iocb*>* iocbs)
{
if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) {
assert(static_cast<long long int>(_remaining_bytes) == _remaining_io_blocks);
return 0;
}
assert(static_cast<size_t>(n_iocbs) <= iocbs->size());
auto actual_n_iocbs = min(static_cast<long long int>(n_iocbs), _remaining_io_blocks);
for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) {
const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size);
const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset;
const auto num_bytes = min(static_cast<long long int>(_block_size), _remaining_bytes);
if (_read_op) {
io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset);
} else {
io_prep_pwrite(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset);
}
_remaining_bytes -= num_bytes;
}
_remaining_io_blocks -= actual_n_iocbs;
return actual_n_iocbs;
}
int get_file_size(const char* filename, long long int& size)
{
struct stat st;
if (stat(filename, &st) == -1) { return -1; }
size = st.st_size;
return 0;
}
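// Returns page-aligned memory (posix_memalign) suitable for O_DIRECT transfers. When lock is true
// the pages are additionally pinned with mlock(); nullptr is returned on any failure.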
void* ds_page_aligned_alloc(const size_t size, const bool lock)
{
void* ptr;
int retval;
retval = posix_memalign(&ptr, (size_t)sysconf(_SC_PAGESIZE), size);
if (retval) { return nullptr; }
if (lock == false) { return ptr; }
auto mlock_ret = mlock(ptr, size);
if (mlock_ret != 0) {
auto mlock_error = errno;
printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error));
free(ptr);
return nullptr;
}
return ptr;
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#pragma once
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <libaio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <deepspeed_aio_types.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
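// Immutable description of a single file <-> memory transfer: open descriptor, base file offset,
// host buffer and total byte count.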
struct io_xfer_ctxt {
const int _fd;
const long long int _base_offset;
const void* _mem_buffer;
const long long int _num_bytes;
io_xfer_ctxt(const int fd,
const long long int file_offset,
const long long int num_bytes,
const void* buffer);
};
struct io_prep_context {
const bool _read_op;
const std::unique_ptr<io_xfer_ctxt>& _xfer_ctxt;
const size_t _block_size;
const std::vector<struct iocb*>* _iocbs;
io_prep_context(const bool read_op,
const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
const size_t block_size,
const std::vector<struct iocb*>* iocbs);
void prep_iocbs(const int n_iocbs,
const size_t num_bytes,
const void* start_buffer,
const long long int start_offset);
};
struct io_prep_generator {
const bool _read_op;
const std::unique_ptr<io_xfer_ctxt>& _xfer_ctxt;
const size_t _block_size;
long long int _remaining_bytes;
long long int _num_io_blocks;
long long int _remaining_io_blocks;
long long int _next_iocb_index;
io_prep_generator(const bool read_op,
const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
const size_t block_size);
int prep_iocbs(const int n_iocbs, std::vector<struct iocb*>* iocbs);
};
void* ds_page_aligned_alloc(const size_t size, const bool lock = false);
int get_file_size(const char* filename, long long int& size);
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_aio_thread.h"
using namespace std;
io_op_desc_t::io_op_desc_t(const bool read_op,
const torch::Tensor& buffer,
const int fd,
const char* filename,
const long long int num_bytes,
const bool validate)
: _read_op(read_op),
_buffer(buffer),
_fd(fd),
_filename(filename),
_num_bytes(num_bytes),
_validate(validate)
{
_cpu_buffer = _buffer.is_cuda() ? _buffer.to(torch::kCPU).pin_memory() : _buffer;
_contiguous_buffer = _cpu_buffer.contiguous();
}
char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
void io_op_desc_t::fini()
{
if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
}
deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config)
: _tid(tid),
_aio_config(aio_config),
_aio_ctxt(new aio_context(aio_config._block_size, aio_config._queue_depth)),
_time_to_exit(false)
{
}
deepspeed_aio_thread_t::~deepspeed_aio_thread_t() {}
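// Worker loop: blocks on _work_sync until work arrives or shutdown is requested, performs each op
// with the overlapped or sequential engine on this thread's slice of the file (offset
// _num_bytes * _tid), then publishes the op on _complete_queue and notifies the waiter.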
void deepspeed_aio_thread_t::run()
{
while (true) {
std::shared_ptr<struct io_op_desc_t> next_io_op = nullptr;
{
std::unique_lock<std::mutex> lock(_work_sync._mutex);
_work_sync._cond_var.wait(lock,
[this] { return (!_work_queue.empty() || _time_to_exit); });
if (!_work_queue.empty()) {
next_io_op = _work_queue.front();
_work_queue.pop();
}
}
if (next_io_op) {
const auto base_offset = next_io_op->_num_bytes * _tid;
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(
next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr()));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(
next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
} else {
do_aio_operation_sequential(
next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
}
{
std::lock_guard<std::mutex> lock(_complete_sync._mutex);
_complete_queue.push(next_io_op);
}
_complete_sync._cond_var.notify_one();
}
if (_time_to_exit) { break; }
}
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#pragma once
#include <condition_variable>
#include <memory>
#include <mutex>
#include <queue>
#include "deepspeed_py_aio.h"
struct io_op_desc_t {
const bool _read_op;
torch::Tensor _buffer;
int _fd;
const std::string _filename;
const long long int _num_bytes;
torch::Tensor _cpu_buffer;
torch::Tensor _contiguous_buffer;
const bool _validate;
io_op_desc_t(const bool read_op,
const torch::Tensor& buffer,
const int fd,
const char* filename,
const long long int num_bytes,
const bool validate);
char* data_ptr() const;
void fini();
};
struct thread_sync_t {
std::mutex _mutex;
std::condition_variable _cond_var;
};
struct deepspeed_aio_thread_t {
const int _tid;
deepspeed_aio_config_t& _aio_config;
std::unique_ptr<struct aio_context> _aio_ctxt;
std::queue<std::shared_ptr<struct io_op_desc_t>> _work_queue;
std::queue<std::shared_ptr<struct io_op_desc_t>> _complete_queue;
bool _time_to_exit;
struct thread_sync_t _work_sync;
struct thread_sync_t _complete_sync;
deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config);
~deepspeed_aio_thread_t();
void run();
};
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cassert>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "deepspeed_py_aio.h"
using namespace std;
using namespace std::chrono;
#define DEBUG_DS_AIO_READ 0
#define DEBUG_DS_AIO_WRITE 0
static const std::string c_library_name = "deepspeed_aio";
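// Stand-alone, single-threaded write: opens the target with O_DIRECT, drives a private aio_context
// with the overlapped or sequential engine, optionally re-reads the file for validation, and
// prints the aio and end-to-end elapsed times. deepspeed_py_aio_read() below mirrors this for
// reads.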
int deepspeed_py_aio_write(const torch::Tensor& buffer,
const char* filename,
const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool validate)
{
const auto start_time = std::chrono::high_resolution_clock::now();
deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false);
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto write_buffer = (char*)buffer.data_ptr();
const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));
if (config._overlap_events) {
do_aio_operation_overlap(false, aio_ctxt, xfer_ctxt, &config, nullptr);
} else {
do_aio_operation_sequential(false, aio_ctxt, xfer_ctxt, &config, nullptr);
}
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
close(fd);
if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
int deepspeed_py_aio_read(torch::Tensor& buffer,
const char* filename,
const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool validate)
{
const auto start_time = std::chrono::high_resolution_clock::now();
long long num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false);
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto read_buffer = (char*)buffer.data_ptr();
assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));
if (config._overlap_events) {
do_aio_operation_overlap(true, aio_ctxt, xfer_ctxt, &config, nullptr);
} else {
do_aio_operation_sequential(true, aio_ctxt, xfer_ctxt, &config, nullptr);
}
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
close(fd);
if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#pragma once
#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>
int deepspeed_py_aio_write(const torch::Tensor& buffer,
const char* filename,
const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool validate);
int deepspeed_py_aio_read(torch::Tensor& buffer,
const char* filename,
const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool validate);
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_aio_handle.h"
using namespace std;
static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const int num_threads)
: _aio_ctxt(new aio_context(block_size, queue_depth)),
_single_submit(single_submit),
_overlap_events(overlap_events),
_num_threads(num_threads),
_aio_config(block_size, queue_depth, single_submit, overlap_events, false),
_num_pending_ops(0)
{
for (auto i = 0; i < num_threads; ++i) {
_thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
}
for (auto& ctxt : _thread_contexts) {
_threads.push_back(std::thread(_start_aio_thread, ctxt));
}
}
deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
{
_stop_threads();
for (auto& thr : _threads) { thr.join(); }
}
const int deepspeed_aio_handle_t::get_block_size() const
{
return _aio_ctxt ? _aio_ctxt->_block_size : -1;
}
const int deepspeed_aio_handle_t::get_queue_depth() const
{
return _aio_ctxt ? _aio_ctxt->_queue_depth : -1;
}
const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; }
const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; }
const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; }
int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
{
const auto start_time = std::chrono::high_resolution_clock::now();
assert(_aio_ctxt);
long long num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto read_buffer = (char*)buffer.data_ptr();
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
} else {
do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
}
close(fd);
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
const char* filename,
const bool validate)
{
assert(_aio_ctxt);
const auto start_time = std::chrono::high_resolution_clock::now();
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto write_buffer = (char*)buffer.data_ptr();
const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
} else {
do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
}
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
close(fd);
if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
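// Fans a scheduled op out to every worker thread (each worker handles the slice at file offset
// _num_bytes * tid) and bumps the pending-op counter.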
void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op)
{
for (auto& ctxt : _thread_contexts) {
{
std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
ctxt->_work_queue.push(scheduled_op);
}
ctxt->_work_sync._cond_var.notify_one();
}
_num_pending_ops++;
}
std::shared_ptr<struct io_op_desc_t> deepspeed_aio_handle_t::_wait_for_aio_work()
{
std::shared_ptr<struct io_op_desc_t> completed_op = nullptr;
for (auto& ctxt : _thread_contexts) {
std::unique_lock<std::mutex> lock(ctxt->_complete_sync._mutex);
ctxt->_complete_sync._cond_var.wait(lock,
[ctxt] { return !ctxt->_complete_queue.empty(); });
completed_op = ctxt->_complete_queue.front();
ctxt->_complete_queue.pop();
}
return completed_op;
}
void deepspeed_aio_handle_t::_stop_threads()
{
assert(0 == _num_pending_ops);
for (auto& ctxt : _thread_contexts) {
{
std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
ctxt->_time_to_exit = true;
}
ctxt->_work_sync._cond_var.notify_one();
}
}
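// Drains all pending ops: each iteration collects one completion from every worker, finalizes the
// op (device copy-back), closes its descriptor and, if requested, validates the whole file.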
int deepspeed_aio_handle_t::wait()
{
assert(_num_pending_ops > 0);
auto num_completed_ops = 0;
while (_num_pending_ops > 0) {
auto completed_op = _wait_for_aio_work();
completed_op->fini();
close(completed_op->_fd);
if (completed_op->_validate) {
validate_aio_operation(completed_op->_read_op,
completed_op->_filename.c_str(),
completed_op->data_ptr(),
_num_threads * completed_op->_num_bytes);
}
--_num_pending_ops;
++num_completed_ops;
}
return num_completed_ops;
}
bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
const long long int num_bytes)
{
const auto op_string = read_op ? "Read" : "Write";
if (num_bytes % get_thread_count()) {
std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
<< " not divisible by thread count = " << get_thread_count() << std::endl;
return false;
}
return true;
}
int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async)
{
long long num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
const auto buffer_bytes = static_cast<long long int>(buffer.nbytes());
if (buffer_bytes != num_file_bytes) {
std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
<< " != " << num_file_bytes << std::endl;
}
assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
assert((num_file_bytes % _num_threads) == 0);
if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto scheduled_op = std::make_shared<io_op_desc_t>(
true, buffer, fd, filename, (num_file_bytes / _num_threads), validate);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
}
int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async)
{
const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
assert((num_write_bytes % _num_threads) == 0);
if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto scheduled_op = std::make_shared<io_op_desc_t>(
false, buffer, fd, filename, (num_write_bytes / _num_threads), validate);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
}
int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
{
return pread(buffer, filename, false, false);
}
int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
{
return pwrite(buffer, filename, false, false);
}
int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
{
return pread(buffer, filename, false, true);
}
int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
{
return pwrite(buffer, filename, false, true);
}
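// Illustrative usage sketch (not part of the library; the path and num_file_bytes below are
// placeholders). The handle is normally driven from Python through the torch extension bindings,
// but the same calls can be made directly from C++:
//
//   deepspeed_aio_handle_t handle(128 * 1024,
//                                 /*queue_depth=*/8,
//                                 /*single_submit=*/false,
//                                 /*overlap_events=*/true,
//                                 /*num_threads=*/1);
//   auto buffer = torch::empty({num_file_bytes}, torch::kUInt8);  // must match the file size
//   handle.sync_pread(buffer, "/tmp/tensor.swp");    // blocking parallel read
//   handle.async_pwrite(buffer, "/tmp/tensor.swp");  // queue a write ...
//   handle.wait();                                   // ... then reap it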
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_aio_handle.h"
using namespace std;
static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const int num_threads)
: _aio_ctxt(new aio_context(block_size, queue_depth)),
_single_submit(single_submit),
_overlap_events(overlap_events),
_num_threads(num_threads),
_aio_config(block_size, queue_depth, single_submit, overlap_events, false),
_num_pending_ops(0)
{
for (auto i = 0; i < num_threads; ++i) {
_thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
}
for (auto& ctxt : _thread_contexts) {
_threads.push_back(std::thread(_start_aio_thread, ctxt));
}
}
deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
{
_stop_threads();
for (auto& thr : _threads) { thr.join(); }
}
const int deepspeed_aio_handle_t::get_block_size() const
{
return _aio_ctxt ? _aio_ctxt->_block_size : -1;
}
const int deepspeed_aio_handle_t::get_queue_depth() const
{
return _aio_ctxt ? _aio_ctxt->_queue_depth : -1;
}
const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; }
const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; }
const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; }
int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
{
const auto start_time = std::chrono::high_resolution_clock::now();
assert(_aio_ctxt);
long long num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto read_buffer = (char*)buffer.data_ptr();
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
} else {
do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
}
close(fd);
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
const char* filename,
const bool validate)
{
assert(_aio_ctxt);
const auto start_time = std::chrono::high_resolution_clock::now();
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto write_buffer = (char*)buffer.data_ptr();
const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
} else {
do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
}
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
close(fd);
if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op)
{
for (auto& ctxt : _thread_contexts) {
{
std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
ctxt->_work_queue.push(scheduled_op);
}
ctxt->_work_sync._cond_var.notify_one();
}
_num_pending_ops++;
}
std::shared_ptr<struct io_op_desc_t> deepspeed_aio_handle_t::_wait_for_aio_work()
{
std::shared_ptr<struct io_op_desc_t> completed_op = nullptr;
for (auto& ctxt : _thread_contexts) {
std::unique_lock<std::mutex> lock(ctxt->_complete_sync._mutex);
ctxt->_complete_sync._cond_var.wait(lock,
[ctxt] { return !ctxt->_complete_queue.empty(); });
completed_op = ctxt->_complete_queue.front();
ctxt->_complete_queue.pop();
}
return completed_op;
}
void deepspeed_aio_handle_t::_stop_threads()
{
assert(0 == _num_pending_ops);
for (auto& ctxt : _thread_contexts) {
{
std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
ctxt->_time_to_exit = true;
}
ctxt->_work_sync._cond_var.notify_one();
}
}
int deepspeed_aio_handle_t::wait()
{
assert(_num_pending_ops > 0);
auto num_completed_ops = 0;
while (_num_pending_ops > 0) {
auto completed_op = _wait_for_aio_work();
completed_op->fini();
close(completed_op->_fd);
if (completed_op->_validate) {
validate_aio_operation(completed_op->_read_op,
completed_op->_filename.c_str(),
completed_op->data_ptr(),
_num_threads * completed_op->_num_bytes);
}
--_num_pending_ops;
++num_completed_ops;
}
return num_completed_ops;
}
bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
const long long int num_bytes)
{
const auto op_string = read_op ? "Read" : "Write";
if (num_bytes % get_thread_count()) {
std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
<< " not divisible by thread count = " << get_thread_count() << std::endl;
return false;
}
return true;
}
int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async)
{
long long num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
const auto buffer_bytes = static_cast<long long int>(buffer.nbytes());
if (buffer_bytes != num_file_bytes) {
std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
<< " != " << num_file_bytes << std::endl;
}
assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
assert((num_file_bytes % _num_threads) == 0);
if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto scheduled_op = std::make_shared<io_op_desc_t>(
true, buffer, fd, filename, (num_file_bytes / _num_threads), validate);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
}
int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async)
{
const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
assert((num_write_bytes % _num_threads) == 0);
if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto scheduled_op = std::make_shared<io_op_desc_t>(
false, buffer, fd, filename, (num_write_bytes / _num_threads), validate);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
}
int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
{
return pread(buffer, filename, false, false);
}
int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
{
return pwrite(buffer, filename, false, false);
}
int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
{
return pread(buffer, filename, false, true);
}
int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
{
return pwrite(buffer, filename, false, true);
}
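The async path above (pread/pwrite with async=true, then wait) splits the transfer across the handle's worker threads, so the byte count must divide evenly by the thread count. A minimal Python sketch of driving it through the binding, assuming DeepSpeed's AsyncIOBuilder is available and treating the file path and sizes as placeholders:

import torch
from deepspeed.ops.aio import AsyncIOBuilder

aio = AsyncIOBuilder().load()
# aio_handle(block_size, queue_depth, single_submit, overlap_events, num_threads)
handle = aio.aio_handle(1024 * 1024, 32, False, False, 4)

buffer = torch.empty(4 * 1024 * 1024, dtype=torch.uint8, device='cpu').pin_memory()
# uint8 buffer, so numel == bytes; must be divisible by the thread count (see _is_valid_parallel_aio_op above)
assert buffer.numel() % handle.get_thread_count() == 0

ret = handle.pwrite(buffer, '/tmp/aio_test.bin', False, True)  # validate=False, async=True
assert ret != -1
handle.wait()  # blocks until the scheduled op completes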

View File

@ -1,68 +1,68 @@
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <condition_variable>
#include <memory>
#include "deepspeed_aio_thread.h"
struct deepspeed_aio_handle_t {
std::unique_ptr<struct aio_context> _aio_ctxt;
const bool _single_submit;
const bool _overlap_events;
const int _num_threads;
deepspeed_aio_config_t _aio_config;
std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
std::vector<std::thread> _threads;
int _num_pending_ops;
deepspeed_aio_handle_t(const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const int num_threads);
~deepspeed_aio_handle_t();
const int get_block_size() const;
const int get_queue_depth() const;
const bool get_single_submit() const;
const bool get_overlap_events() const;
const int get_thread_count() const;
int read(torch::Tensor& buffer, const char* filename, const bool validate);
int write(const torch::Tensor& buffer, const char* filename, const bool validate);
int pread(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async);
int pwrite(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async);
int sync_pread(torch::Tensor& buffer, const char* filename);
int sync_pwrite(const torch::Tensor& buffer, const char* filename);
int async_pread(torch::Tensor& buffer, const char* filename);
int async_pwrite(const torch::Tensor& buffer, const char* filename);
int wait();
void _stop_threads();
void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);
std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();
bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
};

View File

@ -1,133 +1,133 @@
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_copy.h"
#include <omp.h>
#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
#if defined(__AVX512__) or defined(__AVX256__)
union AVX_Data {
#if defined(__AVX512__)
__m512 data;
#else
__m256 data;
#endif
};
#endif
static void helper_memcpy_1(float* dest, float* src, size_t param_size)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH);
for (size_t t = 0; t < rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
size_t offset = copy_size + t;
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH) {
AVX_Data src_4;
src_4.data = SIMD_LOAD(src + i);
SIMD_STORE(dest + i, src_4.data);
}
}
#endif
if (param_size > rounded_size) {
#pragma omp parallel for
for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; }
}
}
static void helper_memcpy_4(float* dest, float* src, size_t param_size)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2));
for (size_t t = 0; t < rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
size_t offset = copy_size + t;
#pragma omp parallel for
for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) {
AVX_Data src_4[4];
src_4[0].data = SIMD_LOAD(src + i);
src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);
SIMD_STORE(dest + i, src_4[0].data);
SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
}
}
#endif
if (param_size > rounded_size)
helper_memcpy_1((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}
static void helper_mempcy_8(float* dest, float* src, size_t param_size)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2));
for (size_t t = 0; t < rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
size_t offset = copy_size + t;
#pragma omp parallel for
for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) {
AVX_Data src_4[8];
src_4[0].data = SIMD_LOAD(src + i);
src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);
src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2));
src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5);
src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6);
src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7);
SIMD_STORE(dest + i, src_4[0].data);
SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data);
SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data);
SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data);
SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data);
}
}
#endif
if (param_size > rounded_size)
helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}
int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src)
{
auto dest_c = dest.contiguous();
auto src_c = src.contiguous();
float* dest_ptr = (float*)dest_c.data_ptr();
float* src_ptr = (float*)src_c.data_ptr();
helper_mempcy_8(dest_ptr, src_ptr, dest_c.size(0));
return 0;
}
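As a quick illustration of the ROUND_DOWN macro used above (size & ~(step - 1), which assumes a power-of-two step such as SIMD_WIDTH), a small Python check of how the SIMD path and the scalar tail split the work:

def round_down(size, step):  # mirrors: #define ROUND_DOWN(size, step) ((size) & ~((step)-1))
    return size & ~(step - 1)

assert round_down(1000, 16) == 992   # 992 elements take the SIMD path, the last 8 take the scalar tail loop
assert round_down(1024, 16) == 1024  # exact multiple: no scalar tail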

View File

@ -1,42 +1,42 @@
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif
#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>
#define TILE (1024 * 1024 * 1024)
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif
int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src);

View File

@ -1,41 +1,41 @@
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <torch/extension.h>
#include "deepspeed_py_aio_handle.h"
#include "deepspeed_py_copy.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchornous I/O Read");
m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchornous I/O Write");
m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");
py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
.def(py::init<const int, const int, const bool, const bool, const int>())
.def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
.def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
.def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
.def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
.def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
.def("read", &deepspeed_aio_handle_t::read)
.def("write", &deepspeed_aio_handle_t::write)
.def("pread", &deepspeed_aio_handle_t::pread)
.def("pwrite", &deepspeed_aio_handle_t::pwrite)
.def("sync_pread", &deepspeed_aio_handle_t::sync_pread)
.def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite)
.def("async_pread", &deepspeed_aio_handle_t::async_pread)
.def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
.def("wait", &deepspeed_aio_handle_t::wait);
}
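A hedged sketch of using the bindings above directly from Python. The extension is normally loaded through DeepSpeed's op builder, as the benchmark scripts below do; the file path is a placeholder:

import torch
from deepspeed.ops.aio import AsyncIOBuilder

ds_aio = AsyncIOBuilder().load()
# aio_handle(block_size, queue_depth, single_submit, overlap_events, num_threads)
h = ds_aio.aio_handle(1024 * 1024, 32, False, False, 1)
print(h.get_block_size(), h.get_queue_depth(), h.get_thread_count())

t = torch.empty(1024 * 1024, dtype=torch.uint8, device='cpu').pin_memory()
h.sync_pwrite(t, '/tmp/aio_test.bin')  # blocking write of the whole tensor
h.sync_pread(t, '/tmp/aio_test.bin')   # blocking read back into the same buffer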

View File

@ -1,144 +1,144 @@
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier
def pre_basic(args, tid, read_op):
io_string = "Read" if read_op else "Write"
num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
ctxt = {}
ctxt['file'] = file
ctxt['num_bytes'] = num_bytes
ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0
return ctxt
def pre_basic_read(pool_params):
args, tid = pool_params
ctxt = pre_basic(args, tid, True)
return ctxt
def pre_basic_write(pool_params):
args, tid = pool_params
ctxt = pre_basic(args, tid, False)
return ctxt
def post_basic(pool_params):
_, _, ctxt = pool_params
ctxt["buffer"].detach()
ctxt["buffer"] = None
return ctxt
def main_basic_read(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_read(ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_basic_write(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_write(ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = pre_basic_read
schedule['post'] = post_basic
schedule['main'] = main_basic_read
else:
schedule['pre'] = pre_basic_write
schedule['post'] = post_basic
schedule['main'] = main_basic_write
return schedule
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, args.threads)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
task_barrier(aio_barrier, args.threads)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
for i in range(args.loops):
task_log(tid, f'running main task {i}')
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
task_barrier(aio_barrier, args.threads)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
task_barrier(aio_barrier, args.threads)
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
def aio_basic_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier
def pre_basic(args, tid, read_op):
io_string = "Read" if read_op else "Write"
num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
ctxt = {}
ctxt['file'] = file
ctxt['num_bytes'] = num_bytes
ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0
return ctxt
def pre_basic_read(pool_params):
args, tid = pool_params
ctxt = pre_basic(args, tid, True)
return ctxt
def pre_basic_write(pool_params):
args, tid = pool_params
ctxt = pre_basic(args, tid, False)
return ctxt
def post_basic(pool_params):
_, _, ctxt = pool_params
ctxt["buffer"].detach()
ctxt["buffer"] = None
return ctxt
def main_basic_read(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_read(ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_basic_write(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_write(ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = pre_basic_read
schedule['post'] = post_basic
schedule['main'] = main_basic_read
else:
schedule['pre'] = pre_basic_write
schedule['post'] = post_basic
schedule['main'] = main_basic_write
return schedule
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, args.threads)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
task_barrier(aio_barrier, args.threads)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
for i in range(args.loops):
task_log(tid, f'running main task {i}')
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
task_barrier(aio_barrier, args.threads)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
task_barrier(aio_barrier, args.threads)
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
def aio_basic_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)

View File

@ -1,176 +1,176 @@
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier
def pre_handle(args, tid, read_op):
io_string = "Read" if read_op else "Write"
num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
if args.gpu:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
else:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
io_parallel)
task_log(tid, f'created deepspeed aio handle')
ctxt = {}
ctxt['file'] = file
ctxt['num_bytes'] = num_bytes
ctxt['handle'] = handle
ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0
return ctxt
def pre_handle_read(pool_params):
args, tid = pool_params
ctxt = pre_handle(args, tid, True)
return ctxt
def pre_handle_write(pool_params):
args, tid = pool_params
ctxt = pre_handle(args, tid, False)
return ctxt
def post_handle(pool_params):
_, _, ctxt = pool_params
ctxt["buffer"].detach()
ctxt["buffer"] = None
return ctxt
def main_parallel_read(pool_params):
args, tid, ctxt = pool_params
handle = ctxt['handle']
start_time = time.time()
ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True)
assert ret != -1
handle.wait()
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_parallel_write(pool_params):
args, tid, ctxt = pool_params
handle = ctxt['handle']
start_time = time.time()
ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True)
assert ret != -1
handle.wait()
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_handle_read(pool_parms):
args, tid, ctxt = pool_parms
handle = ctxt['handle']
start_time = time.time()
ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate)
assert ret != -1
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_handle_write(pool_parms):
args, tid, ctxt = pool_parms
handle = ctxt['handle']
start_time = time.time()
ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate)
assert ret != -1
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = pre_handle_read
schedule['post'] = post_handle
schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read
else:
schedule['pre'] = pre_handle_write
schedule['post'] = post_handle
schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write
return schedule
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, args.threads)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
task_barrier(aio_barrier, args.threads)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
for i in range(args.loops):
task_log(tid, f'running main task {i}')
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
task_barrier(aio_barrier, args.threads)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
task_barrier(aio_barrier, args.threads)
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
def aio_handle_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier
def pre_handle(args, tid, read_op):
io_string = "Read" if read_op else "Write"
num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
if args.gpu:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
else:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
io_parallel)
task_log(tid, f'created deepspeed aio handle')
ctxt = {}
ctxt['file'] = file
ctxt['num_bytes'] = num_bytes
ctxt['handle'] = handle
ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0
return ctxt
def pre_handle_read(pool_params):
args, tid = pool_params
ctxt = pre_handle(args, tid, True)
return ctxt
def pre_handle_write(pool_params):
args, tid = pool_params
ctxt = pre_handle(args, tid, False)
return ctxt
def post_handle(pool_params):
_, _, ctxt = pool_params
ctxt["buffer"].detach()
ctxt["buffer"] = None
return ctxt
def main_parallel_read(pool_params):
args, tid, ctxt = pool_params
handle = ctxt['handle']
start_time = time.time()
ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True)
assert ret != -1
handle.wait()
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_parallel_write(pool_params):
args, tid, ctxt = pool_params
handle = ctxt['handle']
start_time = time.time()
ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True)
assert ret != -1
handle.wait()
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_handle_read(pool_parms):
args, tid, ctxt = pool_parms
handle = ctxt['handle']
start_time = time.time()
ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate)
assert ret != -1
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_handle_write(pool_parms):
args, tid, ctxt = pool_parms
handle = ctxt['handle']
start_time = time.time()
ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate)
assert ret != -1
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = pre_handle_read
schedule['post'] = post_handle
schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read
else:
schedule['pre'] = pre_handle_write
schedule['post'] = post_handle
schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write
return schedule
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, args.threads)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
task_barrier(aio_barrier, args.threads)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
for i in range(args.loops):
task_log(tid, f'running main task {i}')
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
task_barrier(aio_barrier, args.threads)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
task_barrier(aio_barrier, args.threads)
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
def aio_handle_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)

View File

@ -1,154 +1,154 @@
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import argparse
import re
READ_SPEED = 'read_speed'
WRITE_SPEED = 'write_speed'
PERF_METRICS = [READ_SPEED, WRITE_SPEED]
METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'}
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--log_dir',
type=str,
required=True,
help='Folder of statistics logs')
parser.add_argument('--metric',
type=str,
required=True,
help='Performance metric to report: [read_speed|write_speed]')
args = parser.parse_args()
print(f'args = {args}')
return args
def extract_value(key, file):
INVALID_PREFIXES = ["ds"]
for p in INVALID_PREFIXES:
if key.startswith(p):
return key
try:
if key[0] in ['t', 'd', 'p']:
return int(key[1:])
if key.startswith("bs"):
if key.endswith('K'):
v = key[2:].split('K')
return int(v[0]) * 1024
elif key.endswith('M'):
v = key[2:].split('M')
return int(v[0]) * 1024 * 1024
else:
return int(key[2:])
except:
print(f"{file}: extract_value fails on {key}")
return None
return key
def get_file_key(file):
f, _ = os.path.splitext(os.path.basename(file))
fields = f.split('_')
values = [extract_value(k, file) for k in fields]
return tuple(values)
def get_thread_count(file):
f, _ = os.path.splitext(os.path.basename(file))
fields = f.split('_')
for key in fields:
if key[0] == 't':
return int(key[1:])
return 1
"""
Extract performance metric from log file.
Sample file lines are:
Task Read Latency = 0.031647682189941406 sec
Task Read Speed = 12.342926020792527 GB/sec
E2E Read Latency = 0.031697988510131836 sec
E2E Read Speed = 12.323337169333062 GB/sec
For the above sample, --metric=read_speed corresponds to "E2E Read Speed", and 12.32 will be returned.
"""
def get_metric(file, metric):
thread_count = get_thread_count(file)
with open(file) as f:
for line in f.readlines():
if line.startswith(METRIC_SEARCH[metric]):
if metric in [READ_SPEED, WRITE_SPEED]:
fields = line.split()
return float(fields[-2])
else:
fields = line.split('=')
return float(fields[-1])
return None
def validate_args(args):
if not args.metric in PERF_METRICS:
print(f'{args.metric} is not a valid performance metric')
return False
if not os.path.isdir(args.log_dir):
print(f'{args.log_dir} folder does not exist')
return False
return True
def get_results(log_files, metric):
results = {}
for f in log_files:
file_key = get_file_key(f)
value = get_metric(f, metric)
results[file_key] = value
return results
def get_sorted_results(log_dir, metric):
log_files = [
f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir,
f))
]
log_files_path = [os.path.join(log_dir, f) for f in log_files]
results = get_results(log_files_path, metric)
result_keys = list(results.keys())
sorted_keys = sorted(result_keys)
return sorted_keys, results
def main():
print("Parsing aio statistics")
args = parse_arguments()
if not validate_args(args):
quit()
sorted_keys, results = get_sorted_results(args.log_dir, args.metric)
for k in sorted_keys:
print(f'{k} = {results[k]}')
if __name__ == "__main__":
main()
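To make the key extraction above concrete, a small illustrative example. The log-file naming scheme (op, t<threads>, d<queue depth>, bs<block size>) is an assumption inferred from how these fields are parsed:

assert extract_value('t4', 'example.txt') == 4
assert extract_value('bs1M', 'example.txt') == 1024 * 1024
assert get_file_key('logs/write_t4_d128_bs1M.txt') == ('write', 4, 128, 1048576)
assert get_thread_count('logs/write_t4_d128_bs1M.txt') == 4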
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import argparse
import re
READ_SPEED = 'read_speed'
WRITE_SPEED = 'write_speed'
PERF_METRICS = [READ_SPEED, WRITE_SPEED]
METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'}
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--log_dir',
type=str,
required=True,
help='Folder of statistics logs')
parser.add_argument('--metric',
type=str,
required=True,
help='Performance metric to report: [read_speed|write_speed]')
args = parser.parse_args()
print(f'args = {args}')
return args
def extract_value(key, file):
INVALID_PREFIXES = ["ds"]
for p in INVALID_PREFIXES:
if key.startswith(p):
return key
try:
if key[0] in ['t', 'd', 'p']:
return int(key[1:])
if key.startswith("bs"):
if key.endswith('K'):
v = key[2:].split('K')
return int(v[0]) * 1024
elif key.endswith('M'):
v = key[2:].split('M')
return int(v[0]) * 1024 * 1024
else:
return int(key[2:])
except:
print(f"{file}: extract_value fails on {key}")
return None
return key
def get_file_key(file):
f, _ = os.path.splitext(os.path.basename(file))
fields = f.split('_')
values = [extract_value(k, file) for k in fields]
return tuple(values)
def get_thread_count(file):
f, _ = os.path.splitext(os.path.basename(file))
fields = f.split('_')
for key in fields:
if key[0] == 't':
return int(key[1:])
return 1
"""
Extract performance metric from log file.
Sample file lines are:
Task Read Latency = 0.031647682189941406 sec
Task Read Speed = 12.342926020792527 GB/sec
E2E Read Latency = 0.031697988510131836 sec
E2E Read Speed = 12.323337169333062 GB/sec
For the above sample, -metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned
"""
def get_metric(file, metric):
thread_count = get_thread_count(file)
with open(file) as f:
for line in f.readlines():
if line.startswith(METRIC_SEARCH[metric]):
if metric in [READ_SPEED, WRITE_SPEED]:
fields = line.split()
return float(fields[-2])
else:
fields = line.split('=')
return float(fields[-1])
return None
def validate_args(args):
if not args.metric in PERF_METRICS:
print(f'{args.metric} is not a valid performance metrics')
return False
if not os.path.isdir(args.log_dir):
print(f'{args.log_dir} folder is not existent')
return False
return True
def get_results(log_files, metric):
results = {}
for f in log_files:
file_key = get_file_key(f)
value = get_metric(f, metric)
results[file_key] = value
return results
def get_sorted_results(log_dir, metric):
log_files = [
f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir,
f))
]
log_files_path = [os.path.join(log_dir, f) for f in log_files]
results = get_results(log_files_path, metric)
result_keys = list(results.keys())
sorted_keys = sorted(result_keys)
return sorted_keys, results
def main():
print("Parsing aio statistics")
args = parse_arguments()
if not validate_args(args):
quit()
sorted_keys, results = get_sorted_results(args.log_dir, args.metric)
for k in sorted_keys:
print(f'{k} = {results[k]}')
if __name__ == "__main__":
main()

View File

@ -1,101 +1,101 @@
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import torch
import argparse
import time
import sys
from multiprocessing import Pool
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
from test_ds_aio_utils import refine_args
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--read_file', type=str, default=None, help='Read file.')
parser.add_argument('--write_file', type=str, default=None, help='Write file.')
parser.add_argument('--write_size',
type=str,
default=None,
help='Number of bytes to write.')
parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.')
parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')
parser.add_argument('--threads',
type=int,
default=1,
help='Thread parallelism count.')
parser.add_argument(
'--single_submit',
action='store_true',
help=
'Submit I/O requests one at a time (default is to submit queue_depth requests at once).'
)
parser.add_argument('--overlap_events',
action='store_true',
help='Overlap I/O submission and completion requests.')
parser.add_argument('--validate',
action='store_true',
help='Perform validation in library.')
parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
parser.add_argument('--loops',
type=int,
default=1,
help='Count of operation repetitions')
parser.add_argument('--io_parallel',
type=int,
default=None,
help='Per iop parallelism')
parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
args = parser.parse_args()
print(f'args = {args}')
return args
def validate_args(args):
if args.read_file and not os.path.isfile(args.read_file):
print(f'args validation error: {args.read_file} not found')
return False
return True
def main():
print(f'Testing deepspeed_aio python frontend')
args = parse_arguments()
refine_args(args)
if not validate_args(args):
quit()
mp.set_start_method('spawn')
multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing
if args.read_file:
multiprocess_function(args, True)
if args.write_file:
multiprocess_function(args, False)
if __name__ == "__main__":
main()
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import torch
import argparse
import time
import sys
from multiprocessing import Pool
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
from test_ds_aio_utils import refine_args
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--read_file', type=str, default=None, help='Read file.')
parser.add_argument('--write_file', type=str, default=None, help='Write file.')
parser.add_argument('--write_size',
type=str,
default=None,
help='Number of bytes to write.')
parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.')
parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')
parser.add_argument('--threads',
type=int,
default=1,
help='Thread parallelism count.')
parser.add_argument(
'--single_submit',
action='store_true',
help=
'Submit I/O requests in singles (default is submit queue_depth amount at once.).'
)
parser.add_argument('--overlap_events',
action='store_true',
help='Overlap I/O submission and completion requests.')
parser.add_argument('--validate',
action='store_true',
help='Perform validation in library.')
parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
parser.add_argument('--loops',
type=int,
default=1,
help='Count of operation repetitions')
parser.add_argument('--io_parallel',
type=int,
default=None,
help='Per iop parallelism')
parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
args = parser.parse_args()
print(f'args = {args}')
return args
def validate_args(args):
if args.read_file and not os.path.isfile(args.read_file):
print(f'args validation error: {args.read_file} not found')
return False
return True
def main():
print(f'Testing deepspeed_aio python frontend')
args = parse_arguments()
refine_args(args)
if not validate_args(args):
quit()
mp.set_start_method('spawn')
multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing
if args.read_file:
multiprocess_function(args, True)
if args.write_file:
multiprocess_function(args, False)
if __name__ == "__main__":
main()

View File

@ -1,59 +1,59 @@
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
BYTES_PER_GB = 1024**3
LOG_TIDS = [0]
def task_log(tid, msg):
if tid in LOG_TIDS:
print(f'tid {tid}: {msg}')
def task_barrier(barrier, num_parties):
assert barrier.parties == num_parties
barrier.wait()
assert barrier.broken == False
def report_results(args, read_op, pool_results):
#print(f'pool_results = {pool_results}')
io_string = 'Read' if read_op else 'Write'
if None in pool_results:
print(f'Failure in one of {args.threads} {io_string} processes')
return
total_bytes = sum([num_bytes for _, _, num_bytes in pool_results])
task_latency_sec = max([sec for _, sec, _ in pool_results])
task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB
print(f'Task {io_string} Latency = {task_latency_sec} sec')
print(f'Task {io_string} Speed = {task_speed_GB} GB/sec')
e2e_latency_sec = max([sec for sec, _, _ in pool_results])
e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB
print(f'E2E {io_string} Latency = {e2e_latency_sec} sec')
print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec')
def refine_integer_value(value):
unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
if value[-1] in list(unit_dict.keys()):
int_value = int(value[:-1]) * unit_dict[value[-1]]
return int_value
return int(value)
def refine_args(args):
if args.write_size and type(args.write_size) == str:
args.write_size = refine_integer_value(args.write_size)
if args.block_size and type(args.block_size) == str:
args.block_size = refine_integer_value(args.block_size)
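A quick, purely illustrative check of the size-suffix handling above:

assert refine_integer_value('400') == 400
assert refine_integer_value('32K') == 32 * 1024
assert refine_integer_value('1M') == 1024 ** 2
assert refine_integer_value('1G') == 1024 ** 3
# refine_args() applies this to args.write_size and args.block_size, so a '--block_size 1M' argument becomes 1048576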
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
BYTES_PER_GB = 1024**3
LOG_TIDS = [0]
def task_log(tid, msg):
if tid in LOG_TIDS:
print(f'tid {tid}: {msg}')
def task_barrier(barrier, num_parties):
assert barrier.parties == num_parties
barrier.wait()
assert barrier.broken == False
def report_results(args, read_op, pool_results):
#print(f'pool_results = {pool_results}')
io_string = 'Read' if read_op else 'Write'
if None in pool_results:
print(f'Failure in one of {args.threads} {io_string} processes')
return
total_bytes = sum([num_bytes for _, _, num_bytes in pool_results])
task_latency_sec = max([sec for _, sec, _ in pool_results])
task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB
print(f'Task {io_string} Latency = {task_latency_sec} sec')
print(f'Task {io_string} Speed = {task_speed_GB} GB/sec')
e2e_latency_sec = max([sec for sec, _, _ in pool_results])
e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB
print(f'E2E {io_string} Latency = {e2e_latency_sec} sec')
print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec')
def refine_integer_value(value):
unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
if value[-1] in list(unit_dict.keys()):
int_value = int(value[:-1]) * unit_dict[value[-1]]
return int_value
return int(value)
def refine_args(args):
if args.write_size and type(args.write_size) == str:
args.write_size = refine_integer_value(args.write_size)
if args.block_size and type(args.block_size) == str:
args.block_size = refine_integer_value(args.block_size)

View File

@ -1,39 +1,39 @@
#include "custom_cuda_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
__half2* output_cast = reinterpret_cast<__half2*>(output);
if (id < size) {
float input_f = input[id];
__half2* input_h = reinterpret_cast<__half2*>(&input_f);
output_cast[id] = *input_h;
}
}
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream)
{
int threads = 1024;
size /= 2;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
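The grid sizing above is ordinary ceiling division over 1024-thread blocks; a quick check of the arithmetic (illustrative only):

def blocks_needed(size, threads=1024):  # mirrors dim3 grid_dim((size - 1) / threads + 1)
    return (size - 1) // threads + 1

assert blocks_needed(1024) == 1
assert blocks_needed(1025) == 2
# the _half variant halves `size` first because each thread copies one __half2 (two packed half values)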
#include "custom_cuda_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
__half2* output_cast = reinterpret_cast<__half2*>(output);
if (id < size) {
float input_f = input[id];
__half2* input_h = reinterpret_cast<__half2*>(&input_f);
output_cast[id] = *input_h;
}
}
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream)
{
int threads = 1024;
size /= 2;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}

View File

@ -1,47 +1,47 @@
#ifndef __TIMER_H__
#define __TIMER_H__
#include <cuda_runtime.h>
#include <chrono>
#include "cuda.h"
class GPUTimer {
cudaEvent_t start, stop;
public:
GPUTimer()
{
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
~GPUTimer()
{
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
inline void Record() { cudaEventRecord(start); }
inline void Elapsed(float& time_elapsed)
{
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_elapsed, start, stop);
}
};
class CPUTimer {
std::chrono::high_resolution_clock::time_point start;
public:
CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
inline float Elapsed()
{
auto temp = start;
start = std::chrono::high_resolution_clock::now();
return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() /
1e3);
}
};
#endif

View File

@ -1,147 +1,147 @@
#pragma once
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adagrad_Optimizer {
public:
Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
: _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
{
cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adagrad_Optimizer()
{
cudaFreeHost(_doubled_buffer[0]);
cudaFreeHost(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step)
{
_step++;
if (_step != step) { _step = step; }
}
inline void update_state(float lr, float epsilon, float weight_decay)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
}
private:
float _alpha;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float* _doubled_buffer[2];
bool _buf_index;
cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
AVX_Data weight_decay4;
if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, grads + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }
simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_add<span>(grad_4, grad_4, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
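For readability, a scalar NumPy sketch of what the AVX loop above computes per element, based on my reading of the intrinsics (assuming simd_fma(out, a, b, c) computes a*b + c and simd_div(out, a, b) computes a/b); illustrative, not authoritative:

import numpy as np

def adagrad_step(params, grads, exp_avg_sq, alpha=1e-2, eps=1e-8, weight_decay=0.0):
    g = grads + weight_decay * params if weight_decay > 0 else grads  # optional weight-decay fma into grad_4
    exp_avg_sq += g * g                                               # variance_4 accumulation
    update = grads / (np.sqrt(exp_avg_sq) + eps)                      # momentum_4 (raw grads) over sqrt(variance) + eps
    params += -alpha * update                                         # step_size_4 is -alpha
    return params, exp_avg_sq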


@ -1,222 +1,222 @@
#pragma once
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adam_Optimizer {
public:
Adam_Optimizer(float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true)
: _alpha(alpha),
_betta1(betta1),
_betta2(betta2),
_eps(eps),
_weight_decay(weight_decay),
_betta1_t(1.0),
_betta2_t(1.0),
_step(0),
_buf_index(false),
_adamw_mode(adamw_mode)
{
cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adam_Optimizer()
{
cudaFreeHost(_doubled_buffer[0]);
cudaFreeHost(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step, float beta1, float beta2)
{
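// Track beta1^step and beta2^step incrementally; recompute them from scratch whenever
// the betas change or the external step counter skips ahead.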
if (beta1 != _betta1 || beta2 != _betta2) {
_step = step;
_betta1 = beta1;
_betta2 = beta2;
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
} else {
_step++;
if (_step != step) {
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
_step = step;
} else {
_betta1_t *= _betta1;
_betta2_t *= _betta2;
}
}
}
inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
_bias_correction1 = 1.0f;
_bias_correction2 = 1.0f;
if (bias_correction == 1) {
_bias_correction1 = 1 - _betta1_t;
_bias_correction2 = 1 / sqrt(1 - _betta2_t);
}
}
private:
float _alpha;
float _betta1;
float _betta2;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float _bias_correction1;
float _bias_correction2;
float* _doubled_buffer[2];
bool _buf_index;
bool _adamw_mode;
cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
AVX_Data betta2_4;
betta2_4.data = SIMD_SET(_betta2);
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
AVX_Data betta1_minus1_4;
betta1_minus1_4.data = SIMD_SET(betta1_minus1);
AVX_Data betta2_minus1_4;
betta2_minus1_4.data = SIMD_SET(betta2_minus1);
AVX_Data bias2_sqrt;
bias2_sqrt.data = SIMD_SET(_bias_correction2);
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha / _bias_correction1;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
float w_decay = -1 * _alpha * _weight_decay;
AVX_Data weight_decay4;
if (_weight_decay > 0)
weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
}
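// m = beta1 * m + (1 - beta1) * g
// v = beta2 * v + (1 - beta2) * g^2
// update = m / (sqrt(v) * bias_correction2 + eps), applied as
// param += step_size * update with step_size = -alpha / bias_correction1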
simd_mul<span>(momentum_4, momentum_4, betta1_4);
simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
simd_mul<span>(variance_4, variance_4, betta2_4);
simd_mul<span>(grad_4, grad_4, grad_4);
simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
if (_weight_decay > 0 && _adamw_mode) {
simd_fma<span>(param_4, param_4, weight_decay4, param_4);
}
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif


@ -1,76 +1,76 @@
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
struct Config {
float ratio;
uint32_t dim;
bool training;
Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
inline void SetDim(uint32_t d) { dim = d; }
};
Dropout(const Config& config) : _config(config), _mask(nullptr) {}
virtual ~Dropout() {}
void Forward(int bsz, T* out, const T* vals, cudaStream_t stream, bool bwd = false)
{
launch_dropout<T>(
out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
}
void ForwardWithBias(int bsz, T* vals, const T* bias, cudaStream_t stream)
{
launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void ForwardWithBias(int bsz,
T* out,
const T* vals,
const T* residual,
const T* bias,
cudaStream_t stream)
{
launch_dropout<T>(
out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals, cudaStream_t stream)
{
launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals_out, const T* d_vals, cudaStream_t stream)
{
launch_dropout_grad<T>(
d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
bool HasDropout() const { return _config.RATIO() > 0.0; }
void SetTrainingMode(bool training) { _config.training = training; }
void SetMask(uint8_t* mask)
{
if (!mask) { throw std::runtime_error("Dropout mask is null."); }
_mask = mask;
}
Config GetConfig() const { return _config; }
inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }
private:
uint8_t* _mask;
Config _config;
};
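A minimal usage sketch for this wrapper, assuming the launch_dropout kernels declared in custom_cuda_layers.h; the dropout ratio, mask sizing, and buffer names are illustrative only:

// Illustrative only: fused bias-add + dropout over activations of shape [bsz, dim].
template <typename T>
void dropout_forward_example(int bsz,
                             uint32_t dim,
                             T* activations,
                             const T* bias,
                             uint8_t* mask,  // assumed to hold bsz * dim bytes of device memory
                             cudaStream_t stream)
{
    typename Dropout<T>::Config cfg(0.1f, dim);
    Dropout<T> dropout(cfg);
    dropout.SetMask(mask);
    dropout.ForwardWithBias(bsz, activations, bias, stream);
}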


@ -1,93 +1,93 @@
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class FeedForward {
public:
struct Config {
int batchSize, outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
: batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
{
}
};
FeedForward(Config config) : config_(config) {}
~FeedForward() {}
void Forward(int bsz,
const T* input_ptr,
const T* weights,
T* out,
cublasHandle_t& _cublasHandle)
{
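// Linear layer: out[bsz x outputSize] = input[bsz x inputSize] * weights^T
// (cuBLAS column-major call with m = outputSize, n = bsz, k = inputSize).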
float alpha = T(1.);
float beta = T(0.);
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_T,
CUBLAS_OP_N,
config_.outputSize,
bsz,
config_.inputSize,
&alpha,
&beta,
weights,
input_ptr,
out,
cublasGemmAlgo_t(config_.gemm_algos[0]));
}
void Backward(int bsz,
const T* out_grad,
const T* input_ptr,
const T* weights,
T* weights_grad,
T* bias_grad,
cublasHandle_t& _cublasHandle,
cudaStream_t& stream,
T* inp_grad_out = nullptr,
T* out_grad_trans_out = nullptr)
{
float alpha = (T)1.0, beta = (T)0.0;
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_T,
config_.inputSize,
config_.outputSize,
bsz,
&alpha,
&beta,
input_ptr,
out_grad,
weights_grad,
cublasGemmAlgo_t(config_.gemm_algos[1]));
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
config_.inputSize,
bsz,
config_.outputSize,
&alpha,
&beta,
weights,
out_grad,
inp_grad_out,
cublasGemmAlgo_t(config_.gemm_algos[2]));
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
private:
Config config_;
};
#endif


@ -1,36 +1,36 @@
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class Gelu {
public:
struct Config {
uint32_t intermediate_size;
Config(uint32_t inter_size) : intermediate_size(inter_size) {}
};
Gelu(const Config& config) : _config(config) {}
virtual ~Gelu() {}
void ForwardWithBiasAdd(int bsz,
const T* input_buf,
const T* bias,
T* output,
cudaStream_t stream)
{
launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
}
void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream)
{
launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
}
private:
Config _config;
};


@ -1,293 +1,293 @@
#pragma once
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers.h"
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
if (result) {
std::cout << (std::string("CUDA runtime error: ") + file + ":" + std::to_string(line) +
" \n");
}
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
class GemmTest {
public:
GemmTest(int m, int n, int k, cublasOperation_t ta, cublasOperation_t tb, cublasHandle_t h)
: M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K));
check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N));
check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N));
}
~GemmTest()
{
check_cuda_error(cudaFree(A));
check_cuda_error(cudaFree(B));
check_cuda_error(cudaFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = (T)1.0f;
float beta = (T)0.0f;
int algo_fw = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
N,
M,
K,
&alpha,
&beta,
B,
A,
C,
static_cast<cublasGemmAlgo_t>(algo));
});
int algo_bw1 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
static_cast<cublasGemmAlgo_t>(algo));
});
int algo_bw2 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
K,
M,
N,
&alpha,
&beta,
B,
C,
A,
static_cast<cublasGemmAlgo_t>(algo));
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
cudaDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
cudaDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int M, N, K;
cublasHandle_t handle;
cublasOperation_t transa, transb;
T *A, *B, *C;
};
template <typename T>
class StridedGemmTest {
public:
StridedGemmTest(int b,
int m,
int n,
int k,
cublasOperation_t ta,
cublasOperation_t tb,
cublasHandle_t h)
: bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K * bsz));
check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N * bsz));
check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N * bsz));
}
~StridedGemmTest()
{
check_cuda_error(cudaFree(A));
check_cuda_error(cudaFree(B));
check_cuda_error(cudaFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = (T)1.0f;
float beta = (T)0.0f;
int algo_fw = Run(loops, [=](int algo) {
int stride_a = M * K;
int stride_b = N * K;
int stride_c = M * N;
cublas_strided_batched_gemm(handle,
M,
N,
K,
&alpha,
&beta,
A,
B,
C,
transa,
transb,
stride_a,
stride_b,
stride_c,
bsz,
static_cast<cublasGemmAlgo_t>(algo));
});
int algo_bw1 = Run(loops, [=](int algo) {
int mb = (transa == CUBLAS_OP_T ? K : M);
int kb = (transa == CUBLAS_OP_T ? M : K);
int stride_a = mb * N;
int stride_b = N * kb;
int stride_c = M * K;
// B needs to be transposed.
cublasOperation_t op_b = (transb == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
// Calculate d_A.
cublas_strided_batched_gemm(handle,
mb,
kb,
N,
&alpha,
&beta,
(transa == CUBLAS_OP_T ? B : C),
(transa == CUBLAS_OP_T ? C : B),
A,
CUBLAS_OP_N,
op_b,
stride_a,
stride_b,
stride_c,
bsz,
static_cast<cublasGemmAlgo_t>(algo));
});
int algo_bw2 = Run(loops, [=](int algo) {
// A needs to be transposed.
cublasOperation_t op_a = (transa == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
int stride_a = M * K;
int stride_b = M * N;
int stride_c = N * K;
// Calculate d_B.
cublas_strided_batched_gemm(handle,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
op_a,
CUBLAS_OP_N,
stride_a,
stride_b,
stride_c,
bsz,
static_cast<cublasGemmAlgo_t>(algo));
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
cudaDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
cudaDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int bsz, M, N, K;
cublasHandle_t handle;
cublasOperation_t transa, transb;
T *A, *B, *C;
};
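A minimal sketch of how this tuner might be driven, assuming an initialized cublasHandle_t; the data type, shapes, and loop count are placeholders:

// Illustrative only: times the cuBLAS tensor-op algorithms for an M/N/K GEMM and returns
// the fastest {fw, bw1, bw2} algorithm ids, e.g. to fill a FeedForward<T>::Config's gemm_algos.
std::array<int, 3> tune_gemm_example(int M, int N, int K, cublasHandle_t handle)
{
    GemmTest<float> tuner(M, N, K, CUBLAS_OP_T, CUBLAS_OP_N, handle);
    return tuner.TestAlgo(/*loops=*/100);
}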


@ -1,47 +1,47 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cooperative_groups.h>
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
#define THREADS 256
#define TILE_DIM 32
#define minus_infinity (-1 * std::numeric_limits<float>::infinity())
#define FINAL_MASK 0xffffffff
template <typename T>
void launch_fused_add2(T* out,
const T* inp1,
const T* inp2,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream);
template <typename T>
void launch_fused_add4(T* out,
const T* inp1,
const T* inp2,
const T* inp3,
const T* inp4,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream);
template <typename T>
void launch_fused_add3(T* out,
const T* inp1,
const T* inp2,
const T* inp3,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream);


@ -1,202 +1,202 @@
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "custom_cuda_layers.h"
using namespace std;
template <typename T>
class Normalize_Layer {
public:
struct Config {
uint32_t batchSize;
uint32_t seqLength;
uint32_t hiddenDim;
float epsilon;
bool training;
bool useMean;
Config(uint32_t batch,
uint32_t seq,
uint32_t h,
float epsilon = 1e-12,
bool training = true,
bool useMean = true)
: batchSize(batch),
seqLength(seq),
hiddenDim(h),
epsilon(epsilon),
training(training),
useMean(useMean)
{
}
};
Normalize_Layer(Config config)
: config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr)
{
}
~Normalize_Layer() {}
void ForwardCheckpoint(int bsz, // batch * seq
T* vals,
const T* residual,
const T* gamma,
const T* betta,
cudaStream_t& stream,
bool preLayerNorm = false)
{
launch_bias_residual_layer_norm(vals,
residual,
gamma,
betta,
config_.epsilon,
bsz,
config_.hiddenDim,
stream,
preLayerNorm,
config_.training,
vars,
means);
}
void Forward(int bsz,
T* vals,
const T* residual,
const T* gamma,
const T* betta,
cudaStream_t& stream,
bool preLayerNorm = false)
{
launch_bias_residual_layer_norm(vals,
residual,
gamma,
betta,
config_.epsilon,
bsz,
config_.hiddenDim,
stream,
preLayerNorm,
config_.training,
vars);
}
void Backward(int bsz,
const T* out_grad,
const T* gamma,
T* gamma_grad,
T* betta_grad,
cudaStream_t stream[2],
T* inp_grad_out,
const T* norm_in = nullptr)
{
launch_layerNorm_backward(out_grad,
norm_in,
vars,
means,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream);
}
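// Variant for the useMean == false path: takes the layer-norm output and betta
// instead of the saved input and mean.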
void Backward(int bsz,
const T* out_grad,
const T* gamma,
const T* betta,
T* gamma_grad,
T* betta_grad,
cudaStream_t stream[2],
T* inp_grad_out,
const T* norm_out)
{
launch_layerNorm_backward(out_grad,
norm_out,
vars,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream,
!config_.useMean,
betta);
}
void BackwardFusedAdd(int bsz,
const T* out_grad1,
const T* out_grad2,
const T* gamma,
T* gamma_grad,
T* betta_grad,
cudaStream_t stream[2],
T* inp_grad_out,
const T* norm_in = nullptr)
{
launch_layerNorm_backward_fused_add(out_grad1,
out_grad2,
norm_in,
vars,
means,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream);
}
void BackwardFusedAdd(int bsz,
const T* out_grad1,
const T* out_grad2,
const T* gamma,
const T* betta,
T* gamma_grad,
T* betta_grad,
cudaStream_t stream[2],
T* inp_grad_out,
const T* norm_out)
{
launch_layerNorm_backward_fused_add(out_grad1,
out_grad2,
norm_out,
vars,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream,
!config_.useMean,
betta);
}
inline bool UseMean() const { return config_.useMean; }
inline void SetVar(T* variance)
{
if (!variance) { throw std::runtime_error("Normalize variance is null."); }
vars = variance;
}
inline void SetMean(T* mean)
{
if (!mean) { throw std::runtime_error("Normalize mean is null."); }
means = mean;
}
private:
Config config_;
T* vars;
T* means;
T* vals_hat;
};


@ -1,137 +1,137 @@
#pragma once
#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif
#define TILE (128 * 1024 * 1024)
#if defined(__AVX512__) or defined(__AVX256__)
#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_ADD(x, y) _mm512_add_ps(x, y)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#define SIMD_LOAD2(x, h) \
((h) ? _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)x)) : _mm512_loadu_ps(x))
#define SIMD_STORE2(x, d, h) \
((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \
: _mm512_storeu_ps(x, d))
#define INTV __m256i
#elif defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_ADD(x, y) _mm256_add_ps(x, y)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#define SIMD_LOAD2(x, h) \
((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x))
#define SIMD_STORE2(x, d, h) \
((h) ? _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \
: _mm256_storeu_ps(x, d))
#define INTV __m128i
#endif
union AVX_Data {
#if defined(__AVX512__)
__m512 data;
#elif defined(__AVX256__)
__m256 data;
#endif
// float data_f[16];
};
template <int span>
inline void simd_store(float* dst, AVX_Data* src, bool half_precision)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) {
SIMD_STORE2(dst + SIMD_WIDTH * i, src[i].data, half_precision);
}
}
template <int span>
inline void simd_load(AVX_Data* dst, float* src, bool half_precision)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) {
dst[i].data = SIMD_LOAD2(src + SIMD_WIDTH * i, half_precision);
}
}
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) {
dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a[i].data);
}
}
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data src_a)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) {
dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a.data);
}
}
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data* src_m_r, AVX_Data* src_a)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) {
dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r[i].data, src_a[i].data);
}
}
template <int span>
inline void simd_sqrt(AVX_Data* dst, AVX_Data* src)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_SQRT(src[i].data); }
}
template <int span>
inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r.data); }
}
template <int span>
inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r[i].data); }
}
template <int span>
inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r.data); }
}
template <int span>
inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r[i].data); }
}
template <int span>
inline void simd_div(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
{
#pragma omp parallel for
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_DIV(src_a_l[i].data, src_a_r[i].data); }
}
#endif
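A minimal sketch of how these helpers compose, assuming an AVX build (__AVX256__ or __AVX512__) and a length that is a multiple of SIMD_WIDTH * 4; the function and its names are illustrative only:

// y[i] += scale * x[i], processed four SIMD registers per loop iteration.
inline void scaled_add_example(float* y, float* x, float scale, size_t n)
{
    AVX_Data scale_v;
    scale_v.data = SIMD_SET(scale);
    for (size_t i = 0; i < n; i += SIMD_WIDTH * 4) {
        AVX_Data xv[4];
        AVX_Data yv[4];
        simd_load<4>(xv, x + i, false);
        simd_load<4>(yv, y + i, false);
        simd_fma<4>(yv, xv, scale_v, yv);
        simd_store<4>(y + i, yv, false);
    }
}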


@ -1,60 +1,60 @@
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
#include <fstream>
using namespace std;
template <typename T>
class Softmax {
public:
struct Config {
size_t batchSize;
size_t heads;
size_t seq_length;
size_t prob_depth;
float temperature;
bool mem_alloc;
Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
: batchSize(batch),
heads(h),
seq_length(seq),
prob_depth(prob_size),
temperature(1.0),
mem_alloc(mem_alloc)
{
}
};
Softmax(Config config) : config_(config) {}
~Softmax() {}
void Forward(int bsz, T* vals, const T* attn_mask, cudaStream_t& stream)
{
launch_attn_softmax<T>(vals, attn_mask, bsz, config_.heads, config_.seq_length, stream);
}
void Backward(int bsz, T* out_grad, const T* soft_out, cudaStream_t stream)
{
launch_attn_softmax_backward_v2<T>(
out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream);
}
inline size_t GetProbDepth() const { return config_.prob_depth; }
inline size_t GetBatchSize() const { return config_.batchSize; }
inline size_t GetNumHeads() const { return config_.heads; }
inline size_t GetSeqLength() const { return config_.seq_length; }
inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; }
private:
Config config_;
};


@ -1,179 +1,179 @@
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "context.h"
template <typename T>
class StridedBatchGemm {
public:
struct Config {
int batch_size;
int m;
int n;
int k;
float alpha;
float beta;
cublasOperation_t op_A;
cublasOperation_t op_B;
std::array<int, 3> gemm_algos;
Config(int batch,
int mm,
int nn,
int kk,
float param_alpha,
float param_beta,
cublasOperation_t opA,
cublasOperation_t opB,
const std::array<int, 3>& algos)
: batch_size(batch),
m(mm),
n(nn),
k(kk),
alpha(param_alpha),
beta(param_beta),
op_A(opA),
op_B(opB),
gemm_algos(algos)
{
}
void SetConfig(int mm, int nn, int kk)
{
m = mm;
n = nn;
k = kk;
}
};
StridedBatchGemm(const Config& config) : _config(config) {}
virtual ~StridedBatchGemm() {}
void Forward(int bsz, T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle)
{
int stride_a = _config.m * _config.k;
int stride_b = _config.n * _config.k;
int stride_c = _config.m * _config.n;
cublas_strided_batched_gemm(handle,
_config.m,
_config.n,
_config.k,
&_config.alpha,
&_config.beta,
_buffer_a,
_buffer_b,
output,
_config.op_A,
_config.op_B,
stride_a,
stride_b,
stride_c,
bsz,
cublasGemmAlgo_t(_config.gemm_algos[0]));
}
void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle)
{
int stride_a = _config.m * _config.k;
int stride_b = _config.n * _config.k;
int stride_c = _config.m * _config.n;
cublas_strided_batched_gemm(handle,
_config.m,
_config.n,
_config.k,
&_config.alpha,
&_config.beta,
_buffer_a,
_buffer_b,
output,
_config.op_A,
_config.op_B,
stride_a,
stride_b,
stride_c,
_config.batch_size,
cublasGemmAlgo_t(_config.gemm_algos[0]));
k_buf = _buffer_a;
q_buf = _buffer_b;
}
void Backward(int bsz,
const T* d_output,
const T* _buffer_a,
const T* _buffer_b,
cublasHandle_t handle,
T* inpGradA = nullptr,
T* inpGradB = nullptr)
{
int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m);
int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k);
int stride_a = mb * _config.n;
int stride_b = _config.n * kb;
int stride_c = _config.m * _config.k;
// B needs to be transposed.
cublasOperation_t op_b = (_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
// Calculate d_A.
cublas_strided_batched_gemm(handle,
mb,
kb,
_config.n,
&_config.alpha,
&_config.beta,
(_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
(_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b),
inpGradA,
CUBLAS_OP_N,
op_b,
stride_a,
stride_b,
stride_c,
bsz,
cublasGemmAlgo_t(_config.gemm_algos[1]));
// A needs to be transposed.
cublasOperation_t op_a = (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
stride_a = _config.m * _config.k;
stride_b = _config.m * _config.n;
stride_c = _config.n * _config.k;
// Calculate d_B.
cublas_strided_batched_gemm(handle,
_config.k,
_config.n,
_config.m,
&_config.alpha,
&_config.beta,
_buffer_a,
d_output,
inpGradB,
op_a,
CUBLAS_OP_N,
stride_a,
stride_b,
stride_c,
bsz,
cublasGemmAlgo_t(_config.gemm_algos[2]));
}
inline int GetN() const { return _config.k; }
inline const T* GetBufferA() const { return k_buf; }
inline const T* GetBufferB() const { return q_buf; }
inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
private:
Config _config;
const T* q_buf;
const T* k_buf;
};
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "context.h"
template <typename T>
class StridedBatchGemm {
public:
struct Config {
int batch_size;
int m;
int n;
int k;
float alpha;
float beta;
cublasOperation_t op_A;
cublasOperation_t op_B;
std::array<int, 3> gemm_algos;
Config(int batch,
int mm,
int nn,
int kk,
float param_alpha,
float param_beta,
cublasOperation_t opA,
cublasOperation_t opB,
const std::array<int, 3>& algos)
: batch_size(batch),
m(mm),
n(nn),
k(kk),
alpha(param_alpha),
beta(param_beta),
op_A(opA),
op_B(opB),
gemm_algos(algos)
{
}
void SetConfig(int mm, int nn, int kk)
{
m = mm;
n = nn;
k = kk;
}
};
StridedBatchGemm(const Config& config) : _config(config) {}
virtual ~StridedBatchGemm() {}
void Forward(int bsz, T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle)
{
int stride_a = _config.m * _config.k;
int stride_b = _config.n * _config.k;
int stride_c = _config.m * _config.n;
cublas_strided_batched_gemm(handle,
_config.m,
_config.n,
_config.k,
&_config.alpha,
&_config.beta,
_buffer_a,
_buffer_b,
output,
_config.op_A,
_config.op_B,
stride_a,
stride_b,
stride_c,
bsz,
cublasGemmAlgo_t(_config.gemm_algos[0]));
}
void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle)
{
int stride_a = _config.m * _config.k;
int stride_b = _config.n * _config.k;
int stride_c = _config.m * _config.n;
cublas_strided_batched_gemm(handle,
_config.m,
_config.n,
_config.k,
&_config.alpha,
&_config.beta,
_buffer_a,
_buffer_b,
output,
_config.op_A,
_config.op_B,
stride_a,
stride_b,
stride_c,
_config.batch_size,
cublasGemmAlgo_t(_config.gemm_algos[0]));
k_buf = _buffer_a;
q_buf = _buffer_b;
}
void Backward(int bsz,
const T* d_output,
const T* _buffer_a,
const T* _buffer_b,
cublasHandle_t handle,
T* inpGradA = nullptr,
T* inpGradB = nullptr)
{
int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m);
int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k);
int stride_a = mb * _config.n;
int stride_b = _config.n * kb;
int stride_c = _config.m * _config.k;
// B need to transpose.
cublasOperation_t op_b = (_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
// Calculate d_A.
cublas_strided_batched_gemm(handle,
mb,
kb,
_config.n,
&_config.alpha,
&_config.beta,
(_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
(_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b),
inpGradA,
CUBLAS_OP_N,
op_b,
stride_a,
stride_b,
stride_c,
bsz,
cublasGemmAlgo_t(_config.gemm_algos[1]));
// A need to transpose.
cublasOperation_t op_a = (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
stride_a = _config.m * _config.k;
stride_b = _config.m * _config.n;
stride_c = _config.n * _config.k;
// Calculate d_B.
cublas_strided_batched_gemm(handle,
_config.k,
_config.n,
_config.m,
&_config.alpha,
&_config.beta,
_buffer_a,
d_output,
inpGradB,
op_a,
CUBLAS_OP_N,
stride_a,
stride_b,
stride_c,
bsz,
cublasGemmAlgo_t(_config.gemm_algos[2]));
}
inline int GetN() const { return _config.k; }
inline const T* GetBufferA() const { return k_buf; }
inline const T* GetBufferB() const { return q_buf; }
inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
private:
Config _config;
const T* q_buf;
const T* k_buf;
};
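A minimal usage sketch, assuming the device pointers, the cuBLAS handle, and a declaration of this header are visible at the call site; the shapes, scaling factor, and function name below are placeholders rather than anything taken from the original repository:

// Hypothetical example: attention-score GEMMs for 16 sequences x 12 heads,
// 128 tokens each, head size 64; -1 stands for CUBLAS_GEMM_DEFAULT.
void attention_scores_example(__half* score_dev,
                              const __half* q_dev,
                              const __half* k_dev,
                              cublasHandle_t cublas_handle)
{
    StridedBatchGemm<__half>::Config cfg(16 * 12,      // batch_size = batches x heads
                                         128,          // m: sequence length
                                         128,          // n: sequence length
                                         64,           // k: head size
                                         0.125f,       // alpha = 1 / sqrt(head size)
                                         0.0f,         // beta
                                         CUBLAS_OP_T,  // op_A
                                         CUBLAS_OP_N,  // op_B
                                         {-1, -1, -1});
    StridedBatchGemm<__half> attn_scores(cfg);
    // output = alpha * op_A(k_dev) * op_B(q_dev), strided over the batch.
    attn_scores.Forward(cfg.batch_size, score_dev, k_dev, q_dev, cublas_handle);
}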

File diff suppressed because it is too large

View File

@ -1,330 +1,330 @@
#include "custom_cuda_layers.h"
inline __device__ float gelu(const float x)
{
const float sqrt_param = 0.79788456080286535587989211986876f;
const float mul_param = 0.044715;
return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x)));
}
inline __device__ float d_gelu(const float x)
{
const float sqrt_param = 0.79788456080286535587989211986876f;
const float mul_param = 0.044715;
float x2mul = x * x * mul_param;
float tan_h = tanhf(sqrt_param * (x + x * x2mul));
float dg1 = 0.5f * (1.0f + tan_h);
float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
float dg3 = dg2 * 3 * x2mul;
return (dg1 + dg2 + dg3);
}
/*
Fused bias add with GELU activation.
Loads a vector of 4 elements per thread each iteration, for `iterations`
iterations. It was written with the intention of launching 256-thread
threadblocks, so to launch for bert-large we would set ITERATIONS
to 4. This is currently done automatically as a heuristic, with the
number of iterations computed as ceil(intermediate_size / 1024).
For FP16, the values are loaded from memory as __half but converted
to FP32 for the arithmetic itself, to prevent numerical overflow in
the intermediate hyperbolic tangent, since there is no intrinsic
that computes it directly.
*/
__global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations)
{
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
const float4* input_cast = reinterpret_cast<const float4*>(input);
float4* vals_cast = reinterpret_cast<float4*>(vals);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float4 data = input_cast[row * row_stride + i * loop_stride + id];
data.x = gelu(data.x);
data.y = gelu(data.y);
data.z = gelu(data.z);
data.w = gelu(data.w);
vals_cast[row * row_stride + i * loop_stride + id] = data;
}
}
}
__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
{
#if __CUDA_ARCH__ >= 700
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
const float2* input_cast = reinterpret_cast<const float2*>(input);
float2* vals_cast = reinterpret_cast<float2*>(vals);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
low_data.x = gelu(low_data.x);
low_data.y = gelu(low_data.y);
high_data.x = gelu(high_data.x);
high_data.y = gelu(high_data.y);
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
vals_cast[row * row_stride + i * loop_stride + id] = vals_vec;
}
}
#endif
}
__global__ void fused_bias_gelu(const float* input,
const float* bias,
float* vals,
int row_stride,
int iterations)
{
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
const float4* input_cast = reinterpret_cast<const float4*>(input);
float4* vals_cast = reinterpret_cast<float4*>(vals);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float4 data = input_cast[row * row_stride + i * loop_stride + id];
float4 bias_data = bias_cast[i * loop_stride + id];
data.x += bias_data.x;
data.y += bias_data.y;
data.z += bias_data.z;
data.w += bias_data.w;
data.x = gelu(data.x);
data.y = gelu(data.y);
data.z = gelu(data.z);
data.w = gelu(data.w);
vals_cast[row * row_stride + i * loop_stride + id] = data;
}
}
}
__global__ void fused_bias_gelu(const __half* input,
const __half* bias,
__half* vals,
int row_stride,
int iterations)
{
#if __CUDA_ARCH__ >= 700
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
const float2* input_cast = reinterpret_cast<const float2*>(input);
float2* vals_cast = reinterpret_cast<float2*>(vals);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id];
float2 bias_vec = bias_cast[i * loop_stride + id];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
float2 low_bias = __half22float2(bias_half[0]);
float2 high_bias = __half22float2(bias_half[1]);
low_data.x += low_bias.x;
low_data.y += low_bias.y;
high_data.x += high_bias.x;
high_data.y += high_bias.y;
low_data.x = gelu(low_data.x);
low_data.y = gelu(low_data.y);
high_data.x = gelu(high_data.x);
high_data.y = gelu(high_data.y);
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
vals_cast[row * row_stride + i * loop_stride + id] = vals_vec;
}
}
#endif
}
__global__ void d_gelu_func(float* d_output,
const float* gelu_input,
const float* bias,
int row_stride,
int iterations)
{
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
float4* d_output_cast = reinterpret_cast<float4*>(d_output);
const float4* gelu_input_cast = reinterpret_cast<const float4*>(gelu_input);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float4 output_data = d_output_cast[row * row_stride + i * loop_stride + id];
float4 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id];
float4 bias_data = bias_cast[i * loop_stride + id];
gelu_input_data.x += bias_data.x;
gelu_input_data.y += bias_data.y;
gelu_input_data.z += bias_data.z;
gelu_input_data.w += bias_data.w;
output_data.x *= d_gelu(gelu_input_data.x);
output_data.y *= d_gelu(gelu_input_data.y);
output_data.z *= d_gelu(gelu_input_data.z);
output_data.w *= d_gelu(gelu_input_data.w);
d_output_cast[row * row_stride + i * loop_stride + id] = output_data;
}
}
}
__global__ void d_gelu_func(__half* d_output,
const __half* gelu_input,
const __half* bias,
int row_stride,
int iterations)
{
#if __CUDA_ARCH__ >= 700
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
float2* d_output_cast = reinterpret_cast<float2*>(d_output);
const float2* gelu_input_cast = reinterpret_cast<const float2*>(gelu_input);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
#pragma unroll
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float2 output_data = d_output_cast[row * row_stride + i * loop_stride + id];
float2 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id];
float2 bias_vec = bias_cast[i * loop_stride + id];
__half2* output_data_half = reinterpret_cast<__half2*>(&output_data);
__half2* gelu_input_data_half = reinterpret_cast<__half2*>(&gelu_input_data);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 output_half_0 = __half22float2(output_data_half[0]);
float2 output_half_1 = __half22float2(output_data_half[1]);
float2 gelu_input_half_0 = __half22float2(gelu_input_data_half[0]);
float2 gelu_input_half_1 = __half22float2(gelu_input_data_half[1]);
float2 bias_half_0 = __half22float2(bias_half[0]);
float2 bias_half_1 = __half22float2(bias_half[1]);
gelu_input_half_0.x += bias_half_0.x;
gelu_input_half_0.y += bias_half_0.y;
gelu_input_half_1.x += bias_half_1.x;
gelu_input_half_1.y += bias_half_1.y;
output_half_0.x *= d_gelu(gelu_input_half_0.x);
output_half_0.y *= d_gelu(gelu_input_half_0.y);
output_half_1.x *= d_gelu(gelu_input_half_1.x);
output_half_1.y *= d_gelu(gelu_input_half_1.y);
float2 result;
__half2* result_half2 = reinterpret_cast<__half2*>(&result);
result_half2[0] = __float22half2_rn(output_half_0);
result_half2[1] = __float22half2_rn(output_half_1);
d_output_cast[row * row_stride + i * loop_stride + id] = result;
}
}
#endif
}
template <typename T>
void launch_bias_gelu(const T* input,
const T* bias,
T* output,
int intermediate_size,
int batch_size,
cudaStream_t stream)
{
int iterations = (intermediate_size + 1023) / 1024;
int threads = (intermediate_size - 1) / (iterations * 4) + 1;
dim3 block_dims(threads);
dim3 grid_dims(batch_size);
fused_bias_gelu<<<grid_dims, block_dims, 0, stream>>>(
input, bias, output, intermediate_size / 4, iterations);
}
template <typename T>
void launch_gelu(const T* input,
T* output,
int intermediate_size,
int batch_size,
cudaStream_t stream)
{
int iterations = (intermediate_size + 1023) / 1024;
int threads = (intermediate_size - 1) / (iterations * 4) + 1;
dim3 block_dims(threads);
dim3 grid_dims(batch_size);
gelu_kernel<<<grid_dims, block_dims, 0, stream>>>(
input, output, intermediate_size / 4, iterations);
}
template void launch_bias_gelu<float>(const float*, const float*, float*, int, int, cudaStream_t);
template void launch_bias_gelu<__half>(const __half*,
const __half*,
__half*,
int,
int,
cudaStream_t);
template void launch_gelu<float>(const float*, float*, int, int, cudaStream_t);
template void launch_gelu<__half>(const __half*, __half*, int, int, cudaStream_t);
template <typename T>
void launch_d_gelu(T* d_output,
const T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream)
{
int iterations = (intermediate_size + 1023) / 1024;
int threads = (intermediate_size - 1) / (iterations * 4) + 1;
dim3 block_dims(threads);
dim3 grid_dims(batch_size);
d_gelu_func<<<grid_dims, block_dims, 0, stream>>>(
d_output, input, bias, intermediate_size / 4, iterations);
}
template void launch_d_gelu<float>(float*, const float*, const float*, int, int, cudaStream_t);
template void launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, cudaStream_t);
#include "custom_cuda_layers.h"
inline __device__ float gelu(const float x)
{
const float sqrt_param = 0.79788456080286535587989211986876f;
const float mul_param = 0.044715;
return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x)));
}
inline __device__ float d_gelu(const float x)
{
const float sqrt_param = 0.79788456080286535587989211986876f;
const float mul_param = 0.044715;
float x2mul = x * x * mul_param;
float tan_h = tanhf(sqrt_param * (x + x * x2mul));
float dg1 = 0.5f * (1.0f + tan_h);
float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
float dg3 = dg2 * 3 * x2mul;
return (dg1 + dg2 + dg3);
}
/*
Fused bias add with GELU activation.
Loads a vector of 4 elements per thread each iteration, for `iterations`
iterations. It was written with the intention of launching 256-thread
threadblocks, so to launch for bert-large we would set ITERATIONS
to 4. This is currently done automatically as a heuristic, with the
number of iterations computed as ceil(intermediate_size / 1024).
For FP16, the values are loaded from memory as __half but converted
to FP32 for the arithmetic itself, to prevent numerical overflow in
the intermediate hyperbolic tangent, since there is no intrinsic
that computes it directly.
*/
__global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations)
{
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
const float4* input_cast = reinterpret_cast<const float4*>(input);
float4* vals_cast = reinterpret_cast<float4*>(vals);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float4 data = input_cast[row * row_stride + i * loop_stride + id];
data.x = gelu(data.x);
data.y = gelu(data.y);
data.z = gelu(data.z);
data.w = gelu(data.w);
vals_cast[row * row_stride + i * loop_stride + id] = data;
}
}
}
__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
{
#if __CUDA_ARCH__ >= 700
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
const float2* input_cast = reinterpret_cast<const float2*>(input);
float2* vals_cast = reinterpret_cast<float2*>(vals);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
low_data.x = gelu(low_data.x);
low_data.y = gelu(low_data.y);
high_data.x = gelu(high_data.x);
high_data.y = gelu(high_data.y);
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
vals_cast[row * row_stride + i * loop_stride + id] = vals_vec;
}
}
#endif
}
__global__ void fused_bias_gelu(const float* input,
const float* bias,
float* vals,
int row_stride,
int iterations)
{
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
const float4* input_cast = reinterpret_cast<const float4*>(input);
float4* vals_cast = reinterpret_cast<float4*>(vals);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float4 data = input_cast[row * row_stride + i * loop_stride + id];
float4 bias_data = bias_cast[i * loop_stride + id];
data.x += bias_data.x;
data.y += bias_data.y;
data.z += bias_data.z;
data.w += bias_data.w;
data.x = gelu(data.x);
data.y = gelu(data.y);
data.z = gelu(data.z);
data.w = gelu(data.w);
vals_cast[row * row_stride + i * loop_stride + id] = data;
}
}
}
__global__ void fused_bias_gelu(const __half* input,
const __half* bias,
__half* vals,
int row_stride,
int iterations)
{
#if __CUDA_ARCH__ >= 700
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
const float2* input_cast = reinterpret_cast<const float2*>(input);
float2* vals_cast = reinterpret_cast<float2*>(vals);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float2 vals_vec = input_cast[row * row_stride + i * loop_stride + id];
float2 bias_vec = bias_cast[i * loop_stride + id];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
float2 low_bias = __half22float2(bias_half[0]);
float2 high_bias = __half22float2(bias_half[1]);
low_data.x += low_bias.x;
low_data.y += low_bias.y;
high_data.x += high_bias.x;
high_data.y += high_bias.y;
low_data.x = gelu(low_data.x);
low_data.y = gelu(low_data.y);
high_data.x = gelu(high_data.x);
high_data.y = gelu(high_data.y);
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
vals_cast[row * row_stride + i * loop_stride + id] = vals_vec;
}
}
#endif
}
__global__ void d_gelu_func(float* d_output,
const float* gelu_input,
const float* bias,
int row_stride,
int iterations)
{
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
float4* d_output_cast = reinterpret_cast<float4*>(d_output);
const float4* gelu_input_cast = reinterpret_cast<const float4*>(gelu_input);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float4 output_data = d_output_cast[row * row_stride + i * loop_stride + id];
float4 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id];
float4 bias_data = bias_cast[i * loop_stride + id];
gelu_input_data.x += bias_data.x;
gelu_input_data.y += bias_data.y;
gelu_input_data.z += bias_data.z;
gelu_input_data.w += bias_data.w;
output_data.x *= d_gelu(gelu_input_data.x);
output_data.y *= d_gelu(gelu_input_data.y);
output_data.z *= d_gelu(gelu_input_data.z);
output_data.w *= d_gelu(gelu_input_data.w);
d_output_cast[row * row_stride + i * loop_stride + id] = output_data;
}
}
}
__global__ void d_gelu_func(__half* d_output,
const __half* gelu_input,
const __half* bias,
int row_stride,
int iterations)
{
#if __CUDA_ARCH__ >= 700
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
float2* d_output_cast = reinterpret_cast<float2*>(d_output);
const float2* gelu_input_cast = reinterpret_cast<const float2*>(gelu_input);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
#pragma unroll
for (int i = 0; i < iterations; i++) {
if (i * loop_stride + id < row_stride) {
float2 output_data = d_output_cast[row * row_stride + i * loop_stride + id];
float2 gelu_input_data = gelu_input_cast[row * row_stride + i * loop_stride + id];
float2 bias_vec = bias_cast[i * loop_stride + id];
__half2* output_data_half = reinterpret_cast<__half2*>(&output_data);
__half2* gelu_input_data_half = reinterpret_cast<__half2*>(&gelu_input_data);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 output_half_0 = __half22float2(output_data_half[0]);
float2 output_half_1 = __half22float2(output_data_half[1]);
float2 gelu_input_half_0 = __half22float2(gelu_input_data_half[0]);
float2 gelu_input_half_1 = __half22float2(gelu_input_data_half[1]);
float2 bias_half_0 = __half22float2(bias_half[0]);
float2 bias_half_1 = __half22float2(bias_half[1]);
gelu_input_half_0.x += bias_half_0.x;
gelu_input_half_0.y += bias_half_0.y;
gelu_input_half_1.x += bias_half_1.x;
gelu_input_half_1.y += bias_half_1.y;
output_half_0.x *= d_gelu(gelu_input_half_0.x);
output_half_0.y *= d_gelu(gelu_input_half_0.y);
output_half_1.x *= d_gelu(gelu_input_half_1.x);
output_half_1.y *= d_gelu(gelu_input_half_1.y);
float2 result;
__half2* result_half2 = reinterpret_cast<__half2*>(&result);
result_half2[0] = __float22half2_rn(output_half_0);
result_half2[1] = __float22half2_rn(output_half_1);
d_output_cast[row * row_stride + i * loop_stride + id] = result;
}
}
#endif
}
template <typename T>
void launch_bias_gelu(const T* input,
const T* bias,
T* output,
int intermediate_size,
int batch_size,
cudaStream_t stream)
{
int iterations = (intermediate_size + 1023) / 1024;
int threads = (intermediate_size - 1) / (iterations * 4) + 1;
dim3 block_dims(threads);
dim3 grid_dims(batch_size);
fused_bias_gelu<<<grid_dims, block_dims, 0, stream>>>(
input, bias, output, intermediate_size / 4, iterations);
}
template <typename T>
void launch_gelu(const T* input,
T* output,
int intermediate_size,
int batch_size,
cudaStream_t stream)
{
int iterations = (intermediate_size + 1023) / 1024;
int threads = (intermediate_size - 1) / (iterations * 4) + 1;
dim3 block_dims(threads);
dim3 grid_dims(batch_size);
gelu_kernel<<<grid_dims, block_dims, 0, stream>>>(
input, output, intermediate_size / 4, iterations);
}
template void launch_bias_gelu<float>(const float*, const float*, float*, int, int, cudaStream_t);
template void launch_bias_gelu<__half>(const __half*,
const __half*,
__half*,
int,
int,
cudaStream_t);
template void launch_gelu<float>(const float*, float*, int, int, cudaStream_t);
template void launch_gelu<__half>(const __half*, __half*, int, int, cudaStream_t);
template <typename T>
void launch_d_gelu(T* d_output,
const T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream)
{
int iterations = (intermediate_size + 1023) / 1024;
int threads = (intermediate_size - 1) / (iterations * 4) + 1;
dim3 block_dims(threads);
dim3 grid_dims(batch_size);
d_gelu_func<<<grid_dims, block_dims, 0, stream>>>(
d_output, input, bias, intermediate_size / 4, iterations);
}
template void launch_d_gelu<float>(float*, const float*, const float*, int, int, cudaStream_t);
template void launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, cudaStream_t);
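To make the launch heuristic in launch_bias_gelu and launch_gelu above concrete, here is a hedged call-site sketch; the pointers, stream, and dimensions are assumptions chosen to match a BERT-large FFN, and the launcher is presumed to be declared in custom_cuda_layers.h as the #include suggests:

// Hypothetical call site. For intermediate_size = 4096:
//   iterations = (4096 + 1023) / 1024 = 4
//   threads    = (4096 - 1) / (4 * 4) + 1 = 256
// so each 256-thread block covers one 4096-wide row, four vectorized elements
// per thread per iteration, matching the 256-thread threadblocks mentioned in
// the comment above.
void ffn_activation_example(const __half* ffn_out,
                            const __half* ffn_bias,
                            __half* activated,
                            cudaStream_t stream)
{
    const int intermediate_size = 4096;
    const int rows = 8 * 128;  // batch_size argument = batch x sequence length
    launch_bias_gelu<__half>(ffn_out, ffn_bias, activated, intermediate_size, rows, stream);
}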

View File

@ -1,411 +1,411 @@
#include "general_kernels.h"
namespace cg = cooperative_groups;
template <typename T>
__global__ void column_sum_reduce(const T* __restrict__ inp,
T* __restrict__ out,
int rows,
int width)
{
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int y_stride = width * TILE_DIM;
float localSum = 0;
// Loop across matrix height
if (idx < width) {
int offset = threadIdx.y * width + idx;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
localSum += (float)inp[offset];
offset += y_stride;
}
}
tile[threadIdx.x][threadIdx.y] = localSum;
__syncthreads();
// Sum the shared buffer.
float sum = tile[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) sum += g.shfl_down(sum, i);
if (threadIdx.x == 0) {
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
if (pos < width) out[pos] = sum;
}
}
template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp,
T* out,
int rows,
int cols,
cudaStream_t stream);
template <>
void launch_fuse_transpose_bias_kernel<float>(const float* inp,
float* out,
int rows,
int cols,
cudaStream_t stream)
{
// assert(rows % TILE_DIM == 0);
// assert(cols % TILE_DIM == 0);
dim3 grid_dim((cols - 1) / TILE_DIM + 1);
dim3 block_dim(TILE_DIM, TILE_DIM);
column_sum_reduce<float><<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
template <>
void launch_fuse_transpose_bias_kernel<__half>(const __half* inp,
__half* out,
int rows,
int cols,
cudaStream_t stream)
{
// assert(rows % TILE_DIM == 0);
// assert(cols % TILE_DIM == 0);
dim3 grid_dim((cols - 1) / TILE_DIM + 1);
dim3 block_dim(TILE_DIM, TILE_DIM);
column_sum_reduce<__half><<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
__global__ void fused_add2_kernel(const int N, float* out, const float* inp1, const float* inp2)
{
const float4* inp1_4 = reinterpret_cast<const float4*>(inp1);
const float4* inp2_4 = reinterpret_cast<const float4*>(inp2);
float4* out_4 = reinterpret_cast<float4*>(out);
CUDA_1D_KERNEL_LOOP(j, N)
{
float4 val;
float4 inp1_reg = inp1_4[j];
float4 inp2_reg = inp2_4[j];
val.x = inp1_reg.x + inp2_reg.x;
val.y = inp1_reg.y + inp2_reg.y;
val.z = inp1_reg.z + inp2_reg.z;
val.w = inp1_reg.w + inp2_reg.w;
out_4[j] = val;
}
}
__global__ void fused_add2_kernel(const int N, __half* out, const __half* inp1, const __half* inp2)
{
float2 inp1_4;
float2 inp2_4;
__half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4);
__half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4);
const float2* inp1_arr = reinterpret_cast<const float2*>(inp1);
const float2* inp2_arr = reinterpret_cast<const float2*>(inp2);
CUDA_1D_KERNEL_LOOP(j, N)
{
inp1_4 = inp1_arr[j];
inp2_4 = inp2_arr[j];
float2 inp1_h_f_0 = __half22float2(inp1_h[0]);
float2 inp1_h_f_1 = __half22float2(inp1_h[1]);
float2 inp2_h_f_0 = __half22float2(inp2_h[0]);
float2 inp2_h_f_1 = __half22float2(inp2_h[1]);
inp1_h_f_0.x += inp2_h_f_0.x;
inp1_h_f_0.y += inp2_h_f_0.y;
inp1_h_f_1.x += inp2_h_f_1.x;
inp1_h_f_1.y += inp2_h_f_1.y;
float2 val_f;
__half2* val_h = reinterpret_cast<__half2*>(&val_f);
val_h[0] = __float22half2_rn(inp1_h_f_0);
val_h[1] = __float22half2_rn(inp1_h_f_1);
float2* out_4 = reinterpret_cast<float2*>(out);
out_4[j] = val_f;
}
}
template <>
void launch_fused_add2<float>(float* out,
const float* inp1,
const float* inp2,
int batch_size,
int seq_length,
int hidden_dim,
cudaStream_t& stream)
{
int total_count = batch_size * seq_length * hidden_dim / 4;
dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length);
dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4);
fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(total_count, out, inp1, inp2);
}
template <>
void launch_fused_add2<__half>(__half* out,
const __half* inp1,
const __half* inp2,
int batch_size,
int seq_length,
int hidden_dim,
cudaStream_t& stream)
{
int total_count = batch_size * seq_length * hidden_dim / 4;
dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length);
dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4);
fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(total_count, out, inp1, inp2);
}
__global__ void fused_add3_kernel(float* out,
const float* inp1,
const float* inp2,
const float* inp3,
int size,
int row_stride)
{
int row = blockIdx.x;
int id = threadIdx.x;
const float4* inp1_4 = reinterpret_cast<const float4*>(inp1);
const float4* inp2_4 = reinterpret_cast<const float4*>(inp2);
const float4* inp3_4 = reinterpret_cast<const float4*>(inp3);
float4* out_4 = reinterpret_cast<float4*>(out);
float4 val;
float4 inp1_reg = inp1_4[row * row_stride + id];
float4 inp2_reg = inp2_4[row * row_stride + id];
float4 inp3_reg = inp3_4[row * row_stride + id];
val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x;
val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y;
val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z;
val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w;
out_4[row * row_stride + id] = val;
}
__global__ void fused_add3_kernel(__half* out,
const __half* inp1,
const __half* inp2,
const __half* inp3,
int size,
int row_stride)
{
int row = blockIdx.x;
int id = threadIdx.x;
const float2* inp1_arr = reinterpret_cast<const float2*>(inp1);
const float2* inp2_arr = reinterpret_cast<const float2*>(inp2);
const float2* inp3_arr = reinterpret_cast<const float2*>(inp3);
float2 inp1_4 = inp1_arr[row * row_stride + id];
float2 inp2_4 = inp2_arr[row * row_stride + id];
float2 inp3_4 = inp3_arr[row * row_stride + id];
__half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4);
__half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4);
__half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4);
float2 inp1_h_f_0 = __half22float2(inp1_h[0]);
float2 inp1_h_f_1 = __half22float2(inp1_h[1]);
float2 inp2_h_f_0 = __half22float2(inp2_h[0]);
float2 inp2_h_f_1 = __half22float2(inp2_h[1]);
float2 inp3_h_f_0 = __half22float2(inp3_h[0]);
float2 inp3_h_f_1 = __half22float2(inp3_h[1]);
inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x);
inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y);
inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x);
inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y);
float2 val_f;
__half2* val_h = reinterpret_cast<__half2*>(&val_f);
val_h[0] = __float22half2_rn(inp1_h_f_0);
val_h[1] = __float22half2_rn(inp1_h_f_1);
float2* out_4 = reinterpret_cast<float2*>(out);
out_4[row * row_stride + id] = val_f;
}
template <>
void launch_fused_add3<float>(float* out,
const float* inp1,
const float* inp2,
const float* inp3,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream)
{
dim3 grid_dim(batch_size * seq_length);
dim3 block_dim(hidden_size / 4);
fused_add3_kernel<<<grid_dim, block_dim, 0, stream>>>(
out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4);
}
template <>
void launch_fused_add3<__half>(__half* out,
const __half* inp1,
const __half* inp2,
const __half* inp3,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream)
{
dim3 grid_dim(batch_size * seq_length);
dim3 block_dim(hidden_size / 4);
fused_add3_kernel<<<grid_dim, block_dim, 0, stream>>>(
out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4);
}
__global__ void fused_add4_kernel(float* out,
const float* inp1,
const float* inp2,
const float* inp3,
const float* inp4,
int size,
int row_stride)
{
int row = blockIdx.x;
int id = threadIdx.x;
const float4* inp1_4 = reinterpret_cast<const float4*>(inp1);
const float4* inp2_4 = reinterpret_cast<const float4*>(inp2);
const float4* inp3_4 = reinterpret_cast<const float4*>(inp3);
const float4* inp4_4 = reinterpret_cast<const float4*>(inp4);
float4* out_4 = reinterpret_cast<float4*>(out);
float4 val;
float4 inp1_reg = inp1_4[row * row_stride + id];
float4 inp2_reg = inp2_4[row * row_stride + id];
float4 inp3_reg = inp3_4[row * row_stride + id];
float4 inp4_reg = inp4_4[row * row_stride + id];
val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x + inp4_reg.x;
val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y + inp4_reg.y;
val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z + inp4_reg.z;
val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w + inp4_reg.w;
out_4[row * row_stride + id] = val;
}
__global__ void fused_add4_kernel(__half* out,
const __half* inp1,
const __half* inp2,
const __half* inp3,
const __half* inp4,
int size,
int row_stride)
{
int row = blockIdx.x;
int id = threadIdx.x;
const float2* inp1_arr = reinterpret_cast<const float2*>(inp1);
const float2* inp2_arr = reinterpret_cast<const float2*>(inp2);
const float2* inp3_arr = reinterpret_cast<const float2*>(inp3);
const float2* inp4_arr = reinterpret_cast<const float2*>(inp4);
float2 inp1_4 = inp1_arr[row * row_stride + id];
float2 inp2_4 = inp2_arr[row * row_stride + id];
float2 inp3_4 = inp3_arr[row * row_stride + id];
float2 inp4_4 = inp4_arr[row * row_stride + id];
__half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4);
__half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4);
__half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4);
__half2* inp4_h = reinterpret_cast<__half2*>(&inp4_4);
float2 inp1_h_f_0 = __half22float2(inp1_h[0]);
float2 inp1_h_f_1 = __half22float2(inp1_h[1]);
float2 inp2_h_f_0 = __half22float2(inp2_h[0]);
float2 inp2_h_f_1 = __half22float2(inp2_h[1]);
float2 inp3_h_f_0 = __half22float2(inp3_h[0]);
float2 inp3_h_f_1 = __half22float2(inp3_h[1]);
float2 inp4_h_f_0 = __half22float2(inp4_h[0]);
float2 inp4_h_f_1 = __half22float2(inp4_h[1]);
inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x + inp4_h_f_0.x);
inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y + inp4_h_f_0.y);
inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x + inp4_h_f_1.x);
inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y + inp4_h_f_1.y);
float2 val_f;
__half2* val_h = reinterpret_cast<__half2*>(&val_f);
val_h[0] = __float22half2_rn(inp1_h_f_0);
val_h[1] = __float22half2_rn(inp1_h_f_1);
float2* out_4 = reinterpret_cast<float2*>(out);
out_4[row * row_stride + id] = val_f;
}
template <>
void launch_fused_add4<float>(float* out,
const float* inp1,
const float* inp2,
const float* inp3,
const float* inp4,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream)
{
dim3 grid_dim(batch_size * seq_length);
dim3 block_dim(hidden_size / 4);
fused_add4_kernel<<<grid_dim, block_dim, 0, stream>>>(
out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4);
}
template <>
void launch_fused_add4<__half>(__half* out,
const __half* inp1,
const __half* inp2,
const __half* inp3,
const __half* inp4,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream)
{
dim3 grid_dim(batch_size * seq_length);
dim3 block_dim(hidden_size / 4);
fused_add4_kernel<<<grid_dim, block_dim, 0, stream>>>(
out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4);
}
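A hedged sketch of how the column-reduction launcher above might be called to produce a bias gradient; the shapes and names are illustrative assumptions, not taken from the original call sites:

// Hypothetical call site: reduces a (rows = 8 x 128, cols = 1024) activation
// gradient over its rows to the 1024-element bias gradient. The kernel stages
// a TILE_DIM x TILE_DIM tile in shared memory (TILE_DIM comes from
// general_kernels.h) and finishes each column with g.shfl_down reductions;
// one block is launched per TILE_DIM output columns.
void bias_grad_example(const float* d_out, float* d_bias, cudaStream_t stream)
{
    launch_fuse_transpose_bias_kernel<float>(
        d_out, d_bias, /*rows*/ 8 * 128, /*cols*/ 1024, stream);
}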
#include "general_kernels.h"
namespace cg = cooperative_groups;
template <typename T>
__global__ void column_sum_reduce(const T* __restrict__ inp,
T* __restrict__ out,
int rows,
int width)
{
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int y_stride = width * TILE_DIM;
float localSum = 0;
// Loop across matrix height
if (idx < width) {
int offset = threadIdx.y * width + idx;
for (int r = threadIdx.y; r < rows; r += TILE_DIM) {
localSum += (float)inp[offset];
offset += y_stride;
}
}
tile[threadIdx.x][threadIdx.y] = localSum;
__syncthreads();
// Sum the shared buffer.
float sum = tile[threadIdx.y][threadIdx.x];
#ifndef __STOCHASTIC_MODE__
__syncthreads();
#endif
for (int i = 1; i < TILE_DIM; i <<= 1) sum += g.shfl_down(sum, i);
if (threadIdx.x == 0) {
int pos = blockIdx.x * TILE_DIM + threadIdx.y;
if (pos < width) out[pos] = sum;
}
}
template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp,
T* out,
int rows,
int cols,
cudaStream_t stream);
template <>
void launch_fuse_transpose_bias_kernel<float>(const float* inp,
float* out,
int rows,
int cols,
cudaStream_t stream)
{
// assert(rows % TILE_DIM == 0);
// assert(cols % TILE_DIM == 0);
dim3 grid_dim((cols - 1) / TILE_DIM + 1);
dim3 block_dim(TILE_DIM, TILE_DIM);
column_sum_reduce<float><<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
template <>
void launch_fuse_transpose_bias_kernel<__half>(const __half* inp,
__half* out,
int rows,
int cols,
cudaStream_t stream)
{
// assert(rows % TILE_DIM == 0);
// assert(cols % TILE_DIM == 0);
dim3 grid_dim((cols - 1) / TILE_DIM + 1);
dim3 block_dim(TILE_DIM, TILE_DIM);
column_sum_reduce<__half><<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
__global__ void fused_add2_kernel(const int N, float* out, const float* inp1, const float* inp2)
{
const float4* inp1_4 = reinterpret_cast<const float4*>(inp1);
const float4* inp2_4 = reinterpret_cast<const float4*>(inp2);
float4* out_4 = reinterpret_cast<float4*>(out);
CUDA_1D_KERNEL_LOOP(j, N)
{
float4 val;
float4 inp1_reg = inp1_4[j];
float4 inp2_reg = inp2_4[j];
val.x = inp1_reg.x + inp2_reg.x;
val.y = inp1_reg.y + inp2_reg.y;
val.z = inp1_reg.z + inp2_reg.z;
val.w = inp1_reg.w + inp2_reg.w;
out_4[j] = val;
}
}
__global__ void fused_add2_kernel(const int N, __half* out, const __half* inp1, const __half* inp2)
{
float2 inp1_4;
float2 inp2_4;
__half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4);
__half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4);
const float2* inp1_arr = reinterpret_cast<const float2*>(inp1);
const float2* inp2_arr = reinterpret_cast<const float2*>(inp2);
CUDA_1D_KERNEL_LOOP(j, N)
{
inp1_4 = inp1_arr[j];
inp2_4 = inp2_arr[j];
float2 inp1_h_f_0 = __half22float2(inp1_h[0]);
float2 inp1_h_f_1 = __half22float2(inp1_h[1]);
float2 inp2_h_f_0 = __half22float2(inp2_h[0]);
float2 inp2_h_f_1 = __half22float2(inp2_h[1]);
inp1_h_f_0.x += inp2_h_f_0.x;
inp1_h_f_0.y += inp2_h_f_0.y;
inp1_h_f_1.x += inp2_h_f_1.x;
inp1_h_f_1.y += inp2_h_f_1.y;
float2 val_f;
__half2* val_h = reinterpret_cast<__half2*>(&val_f);
val_h[0] = __float22half2_rn(inp1_h_f_0);
val_h[1] = __float22half2_rn(inp1_h_f_1);
float2* out_4 = reinterpret_cast<float2*>(out);
out_4[j] = val_f;
}
}
template <>
void launch_fused_add2<float>(float* out,
const float* inp1,
const float* inp2,
int batch_size,
int seq_length,
int hidden_dim,
cudaStream_t& stream)
{
int total_count = batch_size * seq_length * hidden_dim / 4;
dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length);
dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4);
fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(total_count, out, inp1, inp2);
}
template <>
void launch_fused_add2<__half>(__half* out,
const __half* inp1,
const __half* inp2,
int batch_size,
int seq_length,
int hidden_dim,
cudaStream_t& stream)
{
int total_count = batch_size * seq_length * hidden_dim / 4;
dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length);
dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4);
fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(total_count, out, inp1, inp2);
}
__global__ void fused_add3_kernel(float* out,
const float* inp1,
const float* inp2,
const float* inp3,
int size,
int row_stride)
{
int row = blockIdx.x;
int id = threadIdx.x;
const float4* inp1_4 = reinterpret_cast<const float4*>(inp1);
const float4* inp2_4 = reinterpret_cast<const float4*>(inp2);
const float4* inp3_4 = reinterpret_cast<const float4*>(inp3);
float4* out_4 = reinterpret_cast<float4*>(out);
float4 val;
float4 inp1_reg = inp1_4[row * row_stride + id];
float4 inp2_reg = inp2_4[row * row_stride + id];
float4 inp3_reg = inp3_4[row * row_stride + id];
val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x;
val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y;
val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z;
val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w;
out_4[row * row_stride + id] = val;
}
__global__ void fused_add3_kernel(__half* out,
const __half* inp1,
const __half* inp2,
const __half* inp3,
int size,
int row_stride)
{
int row = blockIdx.x;
int id = threadIdx.x;
const float2* inp1_arr = reinterpret_cast<const float2*>(inp1);
const float2* inp2_arr = reinterpret_cast<const float2*>(inp2);
const float2* inp3_arr = reinterpret_cast<const float2*>(inp3);
float2 inp1_4 = inp1_arr[row * row_stride + id];
float2 inp2_4 = inp2_arr[row * row_stride + id];
float2 inp3_4 = inp3_arr[row * row_stride + id];
__half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4);
__half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4);
__half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4);
float2 inp1_h_f_0 = __half22float2(inp1_h[0]);
float2 inp1_h_f_1 = __half22float2(inp1_h[1]);
float2 inp2_h_f_0 = __half22float2(inp2_h[0]);
float2 inp2_h_f_1 = __half22float2(inp2_h[1]);
float2 inp3_h_f_0 = __half22float2(inp3_h[0]);
float2 inp3_h_f_1 = __half22float2(inp3_h[1]);
inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x);
inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y);
inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x);
inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y);
float2 val_f;
__half2* val_h = reinterpret_cast<__half2*>(&val_f);
val_h[0] = __float22half2_rn(inp1_h_f_0);
val_h[1] = __float22half2_rn(inp1_h_f_1);
float2* out_4 = reinterpret_cast<float2*>(out);
out_4[row * row_stride + id] = val_f;
}
template <>
void launch_fused_add3<float>(float* out,
const float* inp1,
const float* inp2,
const float* inp3,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream)
{
dim3 grid_dim(batch_size * seq_length);
dim3 block_dim(hidden_size / 4);
fused_add3_kernel<<<grid_dim, block_dim, 0, stream>>>(
out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4);
}
template <>
void launch_fused_add3<__half>(__half* out,
const __half* inp1,
const __half* inp2,
const __half* inp3,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream)
{
dim3 grid_dim(batch_size * seq_length);
dim3 block_dim(hidden_size / 4);
fused_add3_kernel<<<grid_dim, block_dim, 0, stream>>>(
out, inp1, inp2, inp3, (batch_size * seq_length * hidden_size), hidden_size / 4);
}
__global__ void fused_add4_kernel(float* out,
const float* inp1,
const float* inp2,
const float* inp3,
const float* inp4,
int size,
int row_stride)
{
int row = blockIdx.x;
int id = threadIdx.x;
const float4* inp1_4 = reinterpret_cast<const float4*>(inp1);
const float4* inp2_4 = reinterpret_cast<const float4*>(inp2);
const float4* inp3_4 = reinterpret_cast<const float4*>(inp3);
const float4* inp4_4 = reinterpret_cast<const float4*>(inp4);
float4* out_4 = reinterpret_cast<float4*>(out);
float4 val;
float4 inp1_reg = inp1_4[row * row_stride + id];
float4 inp2_reg = inp2_4[row * row_stride + id];
float4 inp3_reg = inp3_4[row * row_stride + id];
float4 inp4_reg = inp4_4[row * row_stride + id];
val.x = inp1_reg.x + inp2_reg.x + inp3_reg.x + inp4_reg.x;
val.y = inp1_reg.y + inp2_reg.y + inp3_reg.y + inp4_reg.y;
val.z = inp1_reg.z + inp2_reg.z + inp3_reg.z + inp4_reg.z;
val.w = inp1_reg.w + inp2_reg.w + inp3_reg.w + inp4_reg.w;
out_4[row * row_stride + id] = val;
}
__global__ void fused_add4_kernel(__half* out,
const __half* inp1,
const __half* inp2,
const __half* inp3,
const __half* inp4,
int size,
int row_stride)
{
int row = blockIdx.x;
int id = threadIdx.x;
const float2* inp1_arr = reinterpret_cast<const float2*>(inp1);
const float2* inp2_arr = reinterpret_cast<const float2*>(inp2);
const float2* inp3_arr = reinterpret_cast<const float2*>(inp3);
const float2* inp4_arr = reinterpret_cast<const float2*>(inp4);
float2 inp1_4 = inp1_arr[row * row_stride + id];
float2 inp2_4 = inp2_arr[row * row_stride + id];
float2 inp3_4 = inp3_arr[row * row_stride + id];
float2 inp4_4 = inp4_arr[row * row_stride + id];
__half2* inp1_h = reinterpret_cast<__half2*>(&inp1_4);
__half2* inp2_h = reinterpret_cast<__half2*>(&inp2_4);
__half2* inp3_h = reinterpret_cast<__half2*>(&inp3_4);
__half2* inp4_h = reinterpret_cast<__half2*>(&inp4_4);
float2 inp1_h_f_0 = __half22float2(inp1_h[0]);
float2 inp1_h_f_1 = __half22float2(inp1_h[1]);
float2 inp2_h_f_0 = __half22float2(inp2_h[0]);
float2 inp2_h_f_1 = __half22float2(inp2_h[1]);
float2 inp3_h_f_0 = __half22float2(inp3_h[0]);
float2 inp3_h_f_1 = __half22float2(inp3_h[1]);
float2 inp4_h_f_0 = __half22float2(inp4_h[0]);
float2 inp4_h_f_1 = __half22float2(inp4_h[1]);
inp1_h_f_0.x += (inp2_h_f_0.x + inp3_h_f_0.x + inp4_h_f_0.x);
inp1_h_f_0.y += (inp2_h_f_0.y + inp3_h_f_0.y + inp4_h_f_0.y);
inp1_h_f_1.x += (inp2_h_f_1.x + inp3_h_f_1.x + inp4_h_f_1.x);
inp1_h_f_1.y += (inp2_h_f_1.y + inp3_h_f_1.y + inp4_h_f_1.y);
float2 val_f;
__half2* val_h = reinterpret_cast<__half2*>(&val_f);
val_h[0] = __float22half2_rn(inp1_h_f_0);
val_h[1] = __float22half2_rn(inp1_h_f_1);
float2* out_4 = reinterpret_cast<float2*>(out);
out_4[row * row_stride + id] = val_f;
}
template <>
void launch_fused_add4<float>(float* out,
const float* inp1,
const float* inp2,
const float* inp3,
const float* inp4,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream)
{
dim3 grid_dim(batch_size * seq_length);
dim3 block_dim(hidden_size / 4);
fused_add4_kernel<<<grid_dim, block_dim, 0, stream>>>(
out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4);
}
template <>
void launch_fused_add4<__half>(__half* out,
const __half* inp1,
const __half* inp2,
const __half* inp3,
const __half* inp4,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream)
{
dim3 grid_dim(batch_size * seq_length);
dim3 block_dim(hidden_size / 4);
fused_add4_kernel<<<grid_dim, block_dim, 0, stream>>>(
out, inp1, inp2, inp3, inp4, (batch_size * seq_length * hidden_size), hidden_size / 4);
}
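A similar hedged sketch for the fused residual adds above; the dimensions and names are assumptions:

// Hypothetical call site: out = inp1 + inp2 element-wise for a
// (batch = 8, seq = 128, hidden = 1024) half-precision activation. Each thread
// of the grid-stride loop above handles one float2 (four halfs), so the launch
// covers total_count = 8 * 128 * 1024 / 4 = 262144 vector elements.
void residual_add_example(__half* out,
                          const __half* attn_out,
                          const __half* residual,
                          cudaStream_t stream)
{
    launch_fused_add2<__half>(out, attn_out, residual,
                              /*batch_size*/ 8, /*seq_length*/ 128, /*hidden_dim*/ 1024, stream);
}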

View File

@ -1,110 +1,110 @@
#include "custom_cuda_layers.h"
#define MAX_QUANTIZE_GROUPING 1024
#define loop_unroll 1
#define loop_unroll_bits 1
__global__ void dequantize_kernel(float* output,
const int8_t* input,
const float* qscale,
int output_size,
int hidden_dim,
int groups,
int merge_count)
{
unsigned merge_hidden = hidden_dim >> merge_count;
unsigned quantization_stride = (merge_hidden * output_size) / groups;
unsigned bid = blockIdx.x;
unsigned tid = threadIdx.x;
while (tid < output_size) {
unsigned w_index = bid / merge_hidden;
unsigned q_index = tid + bid * output_size;
auto q = input[q_index];
unsigned merge_hidden_total = w_index * merge_hidden;
unsigned scale_index =
((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride)
<< merge_count) +
w_index;
float scale_data = qscale[scale_index];
output[q_index] = (scale_data * (float)q);
tid += blockDim.x;
}
}
__global__ void dequantize_kernel(__half* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
unsigned merge_count)
{
#if __CUDA_ARCH__ >= 700
unsigned merge_hidden = hidden_dim >> merge_count;
unsigned quantization_stride = (merge_hidden * output_size) / groups;
unsigned bid = blockIdx.x;
unsigned tid = threadIdx.x;
while (tid < output_size) {
unsigned w_index = bid / merge_hidden;
unsigned q_index = tid + bid * output_size;
auto q = input[q_index];
unsigned merge_hidden_total = w_index * merge_hidden;
unsigned scale_index =
((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride)
<< merge_count) +
w_index;
float scale_data = qscale[scale_index];
output[q_index] = __float2half(scale_data * (float)q);
tid += blockDim.x;
}
#endif
}
template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
unsigned merge_count,
cudaStream_t stream)
{
unsigned threads = 1024;
dim3 block_dims(threads);
dim3 grid_dims(hidden_dim);
dequantize_kernel<<<grid_dims, block_dims, 0, stream>>>(
output, input, qscale, output_size, hidden_dim, groups, merge_count);
}
template void launch_dequantize<float>(float*,
const int8_t*,
const float*,
unsigned,
unsigned,
unsigned,
unsigned,
cudaStream_t);
template void launch_dequantize<__half>(__half*,
const int8_t*,
const float*,
unsigned,
unsigned,
unsigned,
unsigned,
cudaStream_t);
#include "custom_cuda_layers.h"
#define MAX_QUANTIZE_GROUPING 1024
#define loop_unroll 1
#define loop_unroll_bits 1
__global__ void dequantize_kernel(float* output,
const int8_t* input,
const float* qscale,
int output_size,
int hidden_dim,
int groups,
int merge_count)
{
unsigned merge_hidden = hidden_dim >> merge_count;
unsigned quantization_stride = (merge_hidden * output_size) / groups;
unsigned bid = blockIdx.x;
unsigned tid = threadIdx.x;
while (tid < output_size) {
unsigned w_index = bid / merge_hidden;
unsigned q_index = tid + bid * output_size;
auto q = input[q_index];
unsigned merge_hidden_total = w_index * merge_hidden;
unsigned scale_index =
((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride)
<< merge_count) +
w_index;
float scale_data = qscale[scale_index];
output[q_index] = (scale_data * (float)q);
tid += blockDim.x;
}
}
__global__ void dequantize_kernel(__half* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
unsigned merge_count)
{
#if __CUDA_ARCH__ >= 700
unsigned merge_hidden = hidden_dim >> merge_count;
unsigned quantization_stride = (merge_hidden * output_size) / groups;
unsigned bid = blockIdx.x;
unsigned tid = threadIdx.x;
while (tid < output_size) {
unsigned w_index = bid / merge_hidden;
unsigned q_index = tid + bid * output_size;
auto q = input[q_index];
unsigned merge_hidden_total = w_index * merge_hidden;
unsigned scale_index =
((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride)
<< merge_count) +
w_index;
float scale_data = qscale[scale_index];
output[q_index] = __float2half(scale_data * (float)q);
tid += blockDim.x;
}
#endif
}
template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
unsigned merge_count,
cudaStream_t stream)
{
unsigned threads = 1024;
dim3 block_dims(threads);
dim3 grid_dims(hidden_dim);
dequantize_kernel<<<grid_dims, block_dims, 0, stream>>>(
output, input, qscale, output_size, hidden_dim, groups, merge_count);
}
template void launch_dequantize<float>(float*,
const int8_t*,
const float*,
unsigned,
unsigned,
unsigned,
unsigned,
cudaStream_t);
template void launch_dequantize<__half>(__half*,
const int8_t*,
const float*,
unsigned,
unsigned,
unsigned,
unsigned,
cudaStream_t);
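A hedged sketch of calling the dequantization launcher above; the matrix shape, group count, and names are illustrative assumptions:

// Hypothetical call site: dequantizes an int8 weight of hidden_dim = 4096 by
// output_size = 1024 back to fp16 with groups = 8 and merge_count = 0, so
// quantization_stride above works out to 4096 * 1024 / 8 elements per scale.
// The launcher issues one 1024-thread block per hidden_dim index.
void dequantize_example(__half* weight_fp16,
                        const int8_t* weight_int8,
                        const float* qscale,
                        cudaStream_t stream)
{
    launch_dequantize<__half>(weight_fp16, weight_int8, qscale,
                              /*output_size*/ 1024, /*hidden_dim*/ 4096,
                              /*groups*/ 8, /*merge_count*/ 0, stream);
}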

View File

@ -1,266 +1,266 @@
#include "custom_cuda_layers.h"
inline __device__ float gelu(const float x)
{
const float sqrt_param = 0.79788456080286535587989211986876f;
const float mul_param = 0.044715;
return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x)));
}
__global__ void fused_bias_gelu(float* input,
const float* bias,
int total_count,
int intermediate_size)
{
float4* input_cast = reinterpret_cast<float4*>(input);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float4 data = input_cast[offset];
float4 bias_data = bias_cast[offset % intermediate_size];
data.x += bias_data.x;
data.y += bias_data.y;
data.z += bias_data.z;
data.w += bias_data.w;
data.x = gelu(data.x);
data.y = gelu(data.y);
data.z = gelu(data.z);
data.w = gelu(data.w);
input_cast[offset] = data;
}
}
__global__ void fused_bias_gelu(__half* input,
const __half* bias,
int total_count,
int intermediate_size)
{
#if __CUDA_ARCH__ >= 700
float2* input_cast = reinterpret_cast<float2*>(input);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float2 vals_vec = input_cast[offset];
float2 bias_vec = bias_cast[offset % intermediate_size];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
float2 low_bias = __half22float2(bias_half[0]);
float2 high_bias = __half22float2(bias_half[1]);
low_data.x += low_bias.x;
low_data.y += low_bias.y;
high_data.x += high_bias.x;
high_data.y += high_bias.y;
low_data.x = gelu(low_data.x);
low_data.y = gelu(low_data.y);
high_data.x = gelu(high_data.x);
high_data.y = gelu(high_data.y);
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
input_cast[offset] = vals_vec;
}
#endif
}
template <typename T>
void launch_bias_gelu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream)
{
int total_count = batch_size * (intermediate_size / 4);
int threads = 1024; // intermediate_size / iterations / 4;
dim3 block_dims(threads);
dim3 grid_dims(((total_count - 1) / 1024 + 1)); // (batch_size);
fused_bias_gelu<<<grid_dims, block_dims, 0, stream>>>(
input, bias, total_count, intermediate_size / 4);
}
template void launch_bias_gelu<float>(float*, const float*, int, int, cudaStream_t);
template void launch_bias_gelu<__half>(__half*, const __half*, int, int, cudaStream_t);
__global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size)
{
float4* input_cast = reinterpret_cast<float4*>(input);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float4 data = input_cast[offset];
float4 bias_data = bias_cast[offset % hidden_size];
data.x += bias_data.x;
data.y += bias_data.y;
data.z += bias_data.z;
data.w += bias_data.w;
input_cast[offset] = data;
}
}
__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
{
#if __CUDA_ARCH__ >= 700
float2* input_cast = reinterpret_cast<float2*>(input);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float2 vals_vec = input_cast[offset];
float2 bias_vec = bias_cast[offset % hidden_size];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
float2 low_bias = __half22float2(bias_half[0]);
float2 high_bias = __half22float2(bias_half[1]);
low_data.x += low_bias.x;
low_data.y += low_bias.y;
high_data.x += high_bias.x;
high_data.y += high_bias.y;
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
input_cast[offset] = vals_vec;
}
#endif
}
template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream)
{
int total_count = batch_size * (hidden_size / 4);
int threads = 1024; // hidden_size / iterations / 4;
dim3 block_dims(threads);
dim3 grid_dims(((total_count - 1) / threads + 1)); // (batch_size);
fused_bias_add<<<grid_dims, block_dims, 0, stream>>>(input, bias, total_count, hidden_size / 4);
}
template void launch_bias_add<float>(float*, const float*, int, int, cudaStream_t);
template void launch_bias_add<__half>(__half*, const __half*, int, int, cudaStream_t);
__global__ void fused_bias_residual(float* input,
const float* residual,
const float* bias,
int total_count,
int intermediate_size)
{
float4* input_cast = reinterpret_cast<float4*>(input);
const float4* residual_cast = reinterpret_cast<const float4*>(residual);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float4 data = input_cast[offset];
float4 res_vec = residual_cast[offset];
float4 bias_data = bias_cast[offset % intermediate_size];
data.x += (res_vec.x + bias_data.x);
data.y += (res_vec.y + bias_data.y);
data.z += (res_vec.z + bias_data.z);
data.w += (res_vec.w + bias_data.w);
input_cast[offset] = data;
}
}
__global__ void fused_bias_residual(__half* input,
const __half* residual,
const __half* bias,
int total_count,
int intermediate_size)
{
#if __CUDA_ARCH__ >= 700
float2* input_cast = reinterpret_cast<float2*>(input);
const float2* residual_cast = reinterpret_cast<const float2*>(residual);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float2 vals_vec = input_cast[offset];
float2 res_vec = residual_cast[offset];
float2 bias_vec = bias_cast[offset % intermediate_size];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
__half2* res_half = reinterpret_cast<__half2*>(&res_vec);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
float2 low_res = __half22float2(res_half[0]);
float2 high_res = __half22float2(res_half[1]);
float2 low_bias = __half22float2(bias_half[0]);
float2 high_bias = __half22float2(bias_half[1]);
low_data.x += (low_res.x + low_bias.x);
low_data.y += (low_res.y + low_bias.y);
high_data.x += (high_res.x + high_bias.x);
high_data.y += (high_res.y + high_bias.y);
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
input_cast[offset] = vals_vec;
}
#endif
}
template <typename T>
void launch_bias_residual(T* input,
const T* residual,
const T* bias,
int batch,
int intermediate_size,
cudaStream_t stream)
{
int total_count = batch * intermediate_size / 4;
dim3 block_dims(1024);
dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size);
fused_bias_residual<<<grid_dims, block_dims, 0, stream>>>(
input, residual, bias, total_count, intermediate_size / 4);
}
template void launch_bias_residual<float>(float*,
const float*,
const float*,
int,
int,
cudaStream_t);
template void launch_bias_residual<__half>(__half*,
const __half*,
const __half*,
int,
int,
cudaStream_t);
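Unlike the training-time launchers earlier in this diff, the three launchers in this file update `input` in place. A hedged sketch of chaining two of them in an inference FFN block follows; every name, shape, and the placement of the second GEMM are assumptions:

// Hypothetical inference call site: hidden = 1024, intermediate = 4096,
// tokens = batch x seq = 8 x 128. Buffers are device pointers produced by the
// surrounding GEMMs (not shown).
void inference_ffn_example(__half* intermediate_buf,  // tokens x 4096, first GEMM output
                           __half* output_buf,        // tokens x 1024, second GEMM output
                           const __half* residual,    // tokens x 1024, block input
                           const __half* inter_bias,
                           const __half* out_bias,
                           cudaStream_t stream)
{
    const int tokens = 8 * 128;
    // In-place bias + GELU on the intermediate activation.
    launch_bias_gelu<__half>(intermediate_buf, inter_bias, 4096, tokens, stream);
    // ... second GEMM writes output_buf ...
    // In-place residual + bias add on the block output.
    launch_bias_residual<__half>(output_buf, residual, out_bias, tokens, 1024, stream);
}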
#include "custom_cuda_layers.h"
inline __device__ float gelu(const float x)
{
const float sqrt_param = 0.79788456080286535587989211986876f;
const float mul_param = 0.044715;
return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x)));
}
__global__ void fused_bias_gelu(float* input,
const float* bias,
int total_count,
int intermediate_size)
{
float4* input_cast = reinterpret_cast<float4*>(input);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float4 data = input_cast[offset];
float4 bias_data = bias_cast[offset % intermediate_size];
data.x += bias_data.x;
data.y += bias_data.y;
data.z += bias_data.z;
data.w += bias_data.w;
data.x = gelu(data.x);
data.y = gelu(data.y);
data.z = gelu(data.z);
data.w = gelu(data.w);
input_cast[offset] = data;
}
}
__global__ void fused_bias_gelu(__half* input,
const __half* bias,
int total_count,
int intermediate_size)
{
#if __CUDA_ARCH__ >= 700
float2* input_cast = reinterpret_cast<float2*>(input);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float2 vals_vec = input_cast[offset];
float2 bias_vec = bias_cast[offset % intermediate_size];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
float2 low_bias = __half22float2(bias_half[0]);
float2 high_bias = __half22float2(bias_half[1]);
low_data.x += low_bias.x;
low_data.y += low_bias.y;
high_data.x += high_bias.x;
high_data.y += high_bias.y;
low_data.x = gelu(low_data.x);
low_data.y = gelu(low_data.y);
high_data.x = gelu(high_data.x);
high_data.y = gelu(high_data.y);
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
input_cast[offset] = vals_vec;
}
#endif
}
template <typename T>
void launch_bias_gelu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream)
{
int total_count = batch_size * (intermediate_size / 4);
int threads = 1024; // intermediate_size / iterations / 4;
dim3 block_dims(threads);
dim3 grid_dims(((total_count - 1) / 1024 + 1)); // (batch_size);
fused_bias_gelu<<<grid_dims, block_dims, 0, stream>>>(
input, bias, total_count, intermediate_size / 4);
}
template void launch_bias_gelu<float>(float*, const float*, int, int, cudaStream_t);
template void launch_bias_gelu<__half>(__half*, const __half*, int, int, cudaStream_t);
__global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size)
{
float4* input_cast = reinterpret_cast<float4*>(input);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float4 data = input_cast[offset];
float4 bias_data = bias_cast[offset % hidden_size];
data.x += bias_data.x;
data.y += bias_data.y;
data.z += bias_data.z;
data.w += bias_data.w;
input_cast[offset] = data;
}
}
__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
{
#if __CUDA_ARCH__ >= 700
float2* input_cast = reinterpret_cast<float2*>(input);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float2 vals_vec = input_cast[offset];
float2 bias_vec = bias_cast[offset % hidden_size];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
float2 low_bias = __half22float2(bias_half[0]);
float2 high_bias = __half22float2(bias_half[1]);
low_data.x += low_bias.x;
low_data.y += low_bias.y;
high_data.x += high_bias.x;
high_data.y += high_bias.y;
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
input_cast[offset] = vals_vec;
}
#endif
}
template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream)
{
int total_count = batch_size * (hidden_size / 4);
int threads = 1024; // hidden_size / iterations / 4;
dim3 block_dims(threads);
dim3 grid_dims(((total_count - 1) / threads + 1)); // (batch_size);
fused_bias_add<<<grid_dims, block_dims, 0, stream>>>(input, bias, total_count, hidden_size / 4);
}
template void launch_bias_add<float>(float*, const float*, int, int, cudaStream_t);
template void launch_bias_add<__half>(__half*, const __half*, int, int, cudaStream_t);
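// fused_bias_add and fused_bias_residual below share the same vectorized pattern as
// fused_bias_gelu: each thread loads one float4 (fp32) or one float2 holding two
// __half2 (fp16), the bias is broadcast across rows via (offset % hidden_size), and
// fused_bias_residual additionally adds the residual stream before writing back in place.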
__global__ void fused_bias_residual(float* input,
const float* residual,
const float* bias,
int total_count,
int intermediate_size)
{
float4* input_cast = reinterpret_cast<float4*>(input);
const float4* residual_cast = reinterpret_cast<const float4*>(residual);
const float4* bias_cast = reinterpret_cast<const float4*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float4 data = input_cast[offset];
float4 res_vec = residual_cast[offset];
float4 bias_data = bias_cast[offset % intermediate_size];
data.x += (res_vec.x + bias_data.x);
data.y += (res_vec.y + bias_data.y);
data.z += (res_vec.z + bias_data.z);
data.w += (res_vec.w + bias_data.w);
input_cast[offset] = data;
}
}
__global__ void fused_bias_residual(__half* input,
const __half* residual,
const __half* bias,
int total_count,
int intermediate_size)
{
#if __CUDA_ARCH__ >= 700
float2* input_cast = reinterpret_cast<float2*>(input);
const float2* residual_cast = reinterpret_cast<const float2*>(residual);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
int offset = blockIdx.x * blockDim.x + threadIdx.x;
if (offset < total_count) {
float2 vals_vec = input_cast[offset];
float2 res_vec = residual_cast[offset];
float2 bias_vec = bias_cast[offset % intermediate_size];
__half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
__half2* res_half = reinterpret_cast<__half2*>(&res_vec);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
float2 low_data = __half22float2(vals_half[0]);
float2 high_data = __half22float2(vals_half[1]);
float2 low_res = __half22float2(res_half[0]);
float2 high_res = __half22float2(res_half[1]);
float2 low_bias = __half22float2(bias_half[0]);
float2 high_bias = __half22float2(bias_half[1]);
low_data.x += (low_res.x + low_bias.x);
low_data.y += (low_res.y + low_bias.y);
high_data.x += (high_res.x + high_bias.x);
high_data.y += (high_res.y + high_bias.y);
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);
input_cast[offset] = vals_vec;
}
#endif
}
template <typename T>
void launch_bias_residual(T* input,
const T* residual,
const T* bias,
int batch,
int intermediate_size,
cudaStream_t stream)
{
int total_count = batch * intermediate_size / 4;
dim3 block_dims(1024);
dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size);
fused_bias_residual<<<grid_dims, block_dims, 0, stream>>>(
input, residual, bias, total_count, intermediate_size / 4);
}
template void launch_bias_residual<float>(float*,
const float*,
const float*,
int,
int,
cudaStream_t);
template void launch_bias_residual<__half>(__half*,
const __half*,
const __half*,
int,
int,
cudaStream_t);


@ -1,426 +1,426 @@
#include <limits>
#include "custom_cuda_layers.h"
#include <cuda_profiler_api.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#define NORM_REG (MAX_REGISTERS)
namespace cg = cooperative_groups;
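// The layer-norm kernels below assign one thread block (1024 threads) per row. Each
// thread buffers a strided subset of the row in registers (at most NORM_REG entries),
// then mean and variance are obtained with a two-level reduction: warp-level
// shfl_down followed by a shared-memory reduction across warps, with the final value
// broadcast back to every lane via g.shfl(sum, 0).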
__global__ void fused_bias_residual_layer_norm(float* output,
const float* vals,
const float* gamma,
const float* beta,
float epsilon,
int row_stride)
{
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id >> 5;
int warp_num = iteration_stride >> 5;
float inp_reg[NORM_REG];
int k = 0;
float sum = 0;
int input_id = id;
while (input_id < row_stride) {
inp_reg[k] = vals[input_id + row * row_stride];
sum += inp_reg[k++];
input_id += iteration_stride;
}
for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i);
__shared__ float shr[MAX_WARP_NUM];
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()];
b.sync();
for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
float mean = sum / (row_stride);
sum = 0.f;
for (int f = 0; f < k; f++) {
inp_reg[f] -= mean;
sum += inp_reg[f] * inp_reg[f];
}
for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i);
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()];
b.sync();
for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (row_stride);
sum += epsilon;
sum = __frsqrt_rn(sum);
for (int f = 0; f < k; f++) {
int out_id = f * iteration_stride + id;
inp_reg[f] = inp_reg[f] * sum;
inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id];
output[out_id + row * row_stride] = inp_reg[f];
}
}
__global__ void fused_bias_residual_layer_norm(__half* output,
const __half* vals,
const __half* gamma,
const __half* beta,
float epsilon,
int row_stride)
{
#if __CUDA_ARCH__ >= 700
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id >> 5;
int warp_num = iteration_stride >> 5;
__half2 inp_reg[NORM_REG];
const __half2* vals_cast = reinterpret_cast<const __half2*>(vals);
__half2* out_cast = reinterpret_cast<__half2*>(output);
int k = 0;
int input_id = id;
while (input_id < row_stride) {
inp_reg[k++] = vals_cast[input_id + row * row_stride];
input_id += iteration_stride;
}
float sum = 0;
for (int f = k - 1; f >= 0; f--) {
float2 inp_f = __half22float2(inp_reg[f]);
sum += inp_f.x + inp_f.y;
}
for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i);
__shared__ float shr[MAX_WARP_NUM];
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()];
b.sync();
for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
float mean = sum / (row_stride << 1);
sum = 0.f;
for (int f = 0; f < k; f++) {
float2 inp_f = __half22float2(inp_reg[f]);
inp_f.x -= mean;
inp_f.y -= mean;
inp_reg[f] = __float22half2_rn(inp_f);
sum += inp_f.x * inp_f.x;
sum += inp_f.y * inp_f.y;
}
for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i);
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()];
b.sync();
for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (row_stride << 1);
sum += epsilon;
sum = __frsqrt_rn(sum);
__half2 variance_h = __float2half2_rn(sum);
const __half2* gamma_cast = reinterpret_cast<const __half2*>(gamma);
const __half2* beta_cast = reinterpret_cast<const __half2*>(beta);
for (int f = 0; f < k; f++) {
int out_id = f * iteration_stride + id;
inp_reg[f] = inp_reg[f] * variance_h;
inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id];
out_cast[out_id + row * row_stride] = inp_reg[f];
}
#endif
}
template <typename T>
void launch_layer_norm(T* out,
T* vals,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream);
template <>
void launch_layer_norm<float>(float* out,
float* vals,
const float* gamma,
const float* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream)
{
constexpr int threads = 1024;
dim3 grid_dim(batch_size);
dim3 block_dim(threads);
fused_bias_residual_layer_norm<<<grid_dim, block_dim, 0, stream>>>(
out, vals, gamma, beta, epsilon, hidden_dim);
}
template <>
void launch_layer_norm<__half>(__half* out,
__half* vals,
const __half* gamma,
const __half* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream)
{
constexpr int threads = 1024;
dim3 grid_dim(batch_size);
dim3 block_dim(threads);
fused_bias_residual_layer_norm<<<grid_dim, block_dim, 0, stream>>>(
out, vals, gamma, beta, epsilon, hidden_dim / 2);
}
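// Illustrative call (not part of the original source): batch_size is the number of
// rows (batch * sequence length); each thread buffers ceil(hidden_dim / 1024) values
// in registers, which must not exceed NORM_REG, and hidden_dim should be even for
// __half since that path works on __half2 and is launched with hidden_dim / 2.
//
//   float* d_out;   // [rows, hidden_dim]
//   float* d_in;    // [rows, hidden_dim]
//   launch_layer_norm(d_out, d_in, d_gamma, d_beta, 1e-12f, rows, hidden_dim, stream);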
__global__ void fused_residual_layer_norm(float* norm,
float* res_add,
float* vals,
float* residual,
const float* bias,
const float* gamma,
const float* beta,
float epsilon,
int row_stride,
bool preLN)
{
int iteration_stride = blockDim.x;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id >> 5;
int warp_num = iteration_stride >> 5;
float inp_reg[NORM_REG];
int k = 0;
int input_id = id;
float sum = 0;
while (input_id < row_stride) {
inp_reg[k] = vals[input_id + row * row_stride];
float res_f = (residual[input_id + row * row_stride]);
float bias_f = (bias[input_id]);
inp_reg[k] += res_f + bias_f;
if (preLN) res_add[input_id + row * row_stride] = inp_reg[k];
sum += inp_reg[k++];
input_id += iteration_stride;
}
for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i);
__shared__ float shr[MAX_WARP_NUM];
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()];
b.sync();
for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
float mean = sum / (row_stride);
sum = 0.f;
for (int f = 0; f < k; f++) {
inp_reg[f] -= mean;
sum += inp_reg[f] * inp_reg[f];
}
for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i);
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()];
b.sync();
for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (row_stride);
sum += epsilon;
sum = __frsqrt_rn(sum);
for (int f = 0; f < k; f++) {
int out_id = f * iteration_stride + id;
inp_reg[f] = inp_reg[f] * sum;
inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id];
norm[out_id + row * row_stride] = inp_reg[f];
}
}
__global__ void fused_residual_layer_norm(__half* norm,
__half* res_add,
__half* vals,
__half* residual,
const __half* bias,
const __half* gamma,
const __half* beta,
float epsilon,
int row_stride,
bool preLN)
{
#if __CUDA_ARCH__ >= 700
int iteration_stride = blockDim.x;
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id >> 5;
int warp_num = iteration_stride >> 5;
__half2 inp_reg[NORM_REG];
__half2* vals_cast = reinterpret_cast<__half2*>(vals);
__half2* norm_cast = reinterpret_cast<__half2*>(norm);
__half2* res_add_cast = reinterpret_cast<__half2*>(res_add);
__half2* residual_cast = reinterpret_cast<__half2*>(residual);
const __half2* bias_cast = reinterpret_cast<const __half2*>(bias);
int k = 0;
int input_id = id;
float sum = 0;
while (input_id < row_stride) {
inp_reg[k] = vals_cast[input_id + row * row_stride];
float2 inp_f = __half22float2(inp_reg[k]);
float2 res_f = __half22float2(residual_cast[input_id + row * row_stride]);
float2 bias_f = __half22float2(bias_cast[input_id]);
inp_f.x += res_f.x + bias_f.x;
inp_f.y += res_f.y + bias_f.y;
inp_reg[k] = __float22half2_rn(inp_f);
if (preLN) res_add_cast[input_id + row * row_stride] = inp_reg[k];
sum += inp_f.x + inp_f.y;
input_id += iteration_stride;
k++;
}
for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i);
__shared__ float shr[MAX_WARP_NUM];
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()];
b.sync();
for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
float mean = sum / (row_stride << 1);
sum = 0.f;
for (int f = 0; f < k; f++) {
float2 inp_f = __half22float2(inp_reg[f]);
inp_f.x -= mean;
inp_f.y -= mean;
inp_reg[f] = __float22half2_rn(inp_f);
sum += inp_f.x * inp_f.x;
sum += inp_f.y * inp_f.y;
}
for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i);
if (g.thread_rank() == 0) shr[gid] = sum;
b.sync();
if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()];
b.sync();
for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i);
sum = g.shfl(sum, 0);
sum /= (row_stride << 1);
sum += epsilon;
sum = __frsqrt_rn(sum);
__half2 variance_h = __float2half2_rn(sum);
const __half2* gamma_cast = reinterpret_cast<const __half2*>(gamma);
const __half2* beta_cast = reinterpret_cast<const __half2*>(beta);
for (int f = 0; f < k; f++) {
int out_id = f * iteration_stride + id;
inp_reg[f] = inp_reg[f] * variance_h;
inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id];
norm_cast[out_id + row * row_stride] = inp_reg[f];
}
#endif
}
template <typename T>
void launch_residual_layer_norm(T* norm,
T* res_add,
T* vals,
T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
cudaStream_t stream);
template <>
void launch_residual_layer_norm<float>(float* norm,
float* res_add,
float* vals,
float* residual,
const float* bias,
const float* gamma,
const float* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
cudaStream_t stream)
{
constexpr int threads = 1024;
dim3 grid_dim(batch_size);
dim3 block_dim(threads);
fused_residual_layer_norm<<<grid_dim, block_dim, 0, stream>>>(
norm, res_add, vals, residual, bias, gamma, beta, epsilon, hidden_dim, preLN);
}
template <>
void launch_residual_layer_norm<__half>(__half* norm,
__half* res_add,
__half* vals,
__half* residual,
const __half* bias,
const __half* gamma,
const __half* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
cudaStream_t stream)
{
constexpr int threads = 1024;
dim3 grid_dim(batch_size);
dim3 block_dim(threads);
fused_residual_layer_norm<<<grid_dim, block_dim, 0, stream>>>(
norm, res_add, vals, residual, bias, gamma, beta, epsilon, hidden_dim / 2, preLN);
}
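// Illustrative call of the fused residual layer norm above (not part of the original
// source). When preLN is true the kernels also write the bias + residual sum into
// res_add so the un-normalized value can feed the next residual connection; with
// preLN false, res_add is left untouched and only norm is produced.
//
//   launch_residual_layer_norm(d_norm, d_res_add, d_vals, d_residual, d_bias,
//                              d_gamma, d_beta, 1e-12f, rows, hidden_dim,
//                              /*preLN=*/true, stream);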

File diff suppressed because it is too large


@ -1,432 +1,432 @@
#include <limits>
#include "custom_cuda_layers.h"
#include <cuda_profiler_api.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#define ATTN_THREADS 1024
#define MAX_REG_SIZE 8
#define minus_infinity -10000.0
void CheckCudaErrorAux(const char* file, unsigned line)
{
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess) return;
std::cerr << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line
<< std::endl;
throw std::runtime_error("CUDA ERROR!!!\n");
}
#define CUDA_CHECK_ERROR() CheckCudaErrorAux(__FILE__, __LINE__)
namespace cg = cooperative_groups;
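// attn_softmax_v2 computes a numerically stable softmax over each attention row:
// every thread reads up to four consecutive scores per iteration, positions that are
// masked out (causal/triangular or outside the local-attention window) are set to
// minus_infinity (-10000), the row max is subtracted before __expf, and the
// exponentials are normalized by the row sum (plus 1e-6 to avoid division by zero).
// Rows wider than one warp are reduced cooperatively through the partialSum shared
// memory buffer; the attention mask is applied only when mask != nullptr and
// recompute is true.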
__global__ void attn_softmax_v2(__half* vals,
__half* mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int total_count,
int heads,
int sequence_length,
int num_seq,
float scale,
int iterations,
int reduceWidth)
{
#if __CUDA_ARCH__ >= 700
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
float2 low_data[MAX_REG_SIZE];
float2 high_data[MAX_REG_SIZE];
__half2 h_scale = __float2half2_rn(scale);
int wid = threadIdx.x >> 5;
int lane = threadIdx.x & 0x1f;
int warp_num = blockDim.x >> 5;
int reduce_blocks = reduceWidth >> 5;
int seq_lane = threadIdx.x % reduceWidth;
__shared__ float partialSum[MAX_WARP_NUM];
int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks);
if (iter_offset < total_count) {
vals += (iter_offset * sequence_length);
int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length);
int seq_id = iter_offset % num_seq;
int seq_id4 = seq_id >> 2;
int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length);
int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2))
? (real_seq_id >> 2) - (window_size >> 2)
: 0;
int window_stride =
(local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1;
float max_val = minus_infinity;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 &&
data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
low_data[i].x = data_id > window_stride ? __half2float(vals[data_id])
: minus_infinity;
low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) &&
(data_id + 1) > window_stride)
? __half2float(vals[data_id + 1])
: minus_infinity;
high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) &&
(data_id + 2) > window_stride)
? __half2float(vals[data_id + 2])
: minus_infinity;
high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) &&
(data_id + 3) > window_stride)
? __half2float(vals[data_id + 3])
: minus_infinity;
if (mask && recompute) {
low_data[i].x += __half2float(mask[data_id + mask_offset]);
low_data[i].y += __half2float(mask[data_id + mask_offset + 1]);
high_data[i].x += __half2float(mask[data_id + mask_offset + 2]);
high_data[i].y += __half2float(mask[data_id + mask_offset + 3]);
}
} else {
low_data[i].x = data_id > window_stride ? __half2float(vals[data_id])
: minus_infinity;
low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) &&
(data_id + 1) > window_stride) &&
(data_id + 1) < sequence_length)
? __half2float(vals[data_id + 1])
: minus_infinity;
high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) &&
(data_id + 2) > window_stride) &&
(data_id + 2) < sequence_length)
? __half2float(vals[data_id + 2])
: minus_infinity;
high_data[i].y = minus_infinity;
if (mask && recompute) {
low_data[i].x += __half2float(mask[data_id + mask_offset]);
if ((data_id + 1) < sequence_length)
low_data[i].y += __half2float(mask[data_id + mask_offset + 1]);
if ((data_id + 2) < sequence_length)
high_data[i].x += __half2float(mask[data_id + mask_offset + 2]);
}
}
// if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id);
max_val = (low_data[i].x > max_val ? low_data[i].x : max_val);
max_val = (low_data[i].y > max_val ? low_data[i].y : max_val);
max_val = (high_data[i].x > max_val ? high_data[i].x : max_val);
max_val = (high_data[i].y > max_val ? high_data[i].y : max_val);
} else {
low_data[i].x = minus_infinity;
low_data[i].y = minus_infinity;
high_data[i].x = minus_infinity;
high_data[i].y = minus_infinity;
}
}
for (int i = 1; i < WARP_SIZE; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
if (reduceWidth > WARP_SIZE) {
if (lane == 0) partialSum[wid] = max_val;
b.sync();
if (lane < warp_num) max_val = partialSum[lane];
b.sync();
for (int i = 1; i < reduce_blocks; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE);
}
float sum = 0;
for (int i = 0; i < iterations; i++) {
low_data[i].x = __expf(low_data[i].x - max_val);
low_data[i].y = __expf(low_data[i].y - max_val);
high_data[i].x = __expf(high_data[i].x - max_val);
high_data[i].y = __expf(high_data[i].y - max_val);
sum += (low_data[i].x + low_data[i].y + high_data[i].x + high_data[i].y);
}
for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i);
if (reduceWidth > WARP_SIZE) {
if (lane == 0) partialSum[wid] = sum;
b.sync();
if (lane < warp_num) sum = partialSum[lane];
b.sync();
for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); }
sum = g.shfl(sum, threadIdx.x / WARP_SIZE);
}
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
if (data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
vals[data_id] = low_data[i].x / sum;
vals[data_id + 1] = low_data[i].y / sum;
vals[data_id + 2] = high_data[i].x / sum;
vals[data_id + 3] = high_data[i].y / sum;
} else {
vals[data_id] = low_data[i].x / sum;
if ((data_id + 1) < sequence_length) vals[data_id + 1] = low_data[i].y / sum;
if ((data_id + 2) < sequence_length) vals[data_id + 2] = high_data[i].x / sum;
}
}
}
}
#endif
}
__global__ void attn_softmax_v2(float* vals,
float* attn_mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int total_count,
int heads,
int sequence_length,
int num_seq,
float scale,
int iterations,
int reduceWidth)
{
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
float4 data[MAX_REG_SIZE];
int wid = threadIdx.x >> 5;
int lane = threadIdx.x & 0x1f;
int warp_num = blockDim.x >> 5;
int reduce_blocks = reduceWidth >> 5;
int seq_lane = threadIdx.x % reduceWidth;
__shared__ float partialSum[MAX_WARP_NUM];
int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks);
if (iter_offset < total_count) {
vals += (iter_offset * sequence_length);
int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length);
int seq_id = iter_offset % num_seq;
int seq_id4 = seq_id >> 2;
int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length);
int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2))
? (real_seq_id >> 2) - (window_size >> 2)
: 0;
int window_stride =
(local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1;
float max_val = minus_infinity;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 &&
data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
data[i].x = (data_id > window_stride ? vals[data_id] : minus_infinity);
data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) &&
(data_id + 1) > window_stride)
? vals[data_id + 1]
: minus_infinity;
data[i].z = ((!triangular || ((data_id + 2) <= seq_id)) &&
(data_id + 2) > window_stride)
? vals[data_id + 2]
: minus_infinity;
data[i].w = ((!triangular || ((data_id + 3) <= seq_id)) &&
(data_id + 3) > window_stride)
? vals[data_id + 3]
: minus_infinity;
if (attn_mask && recompute) {
data[i].x += attn_mask[data_id + mask_offset];
data[i].y += attn_mask[data_id + mask_offset + 1];
data[i].z += attn_mask[data_id + mask_offset + 2];
data[i].w += attn_mask[data_id + mask_offset + 3];
}
} else {
data[i].x = data_id > window_stride ? vals[data_id] : minus_infinity;
data[i].y = (((!triangular || (data_id + 1) <= seq_id)) &&
(data_id + 1) > window_stride && (data_id + 1) < sequence_length)
? (vals[data_id + 1])
: minus_infinity;
data[i].z = (((!triangular || (data_id + 2) <= seq_id)) &&
(data_id + 2) > window_stride && (data_id + 2) < sequence_length)
? (vals[data_id + 2])
: minus_infinity;
data[i].w = minus_infinity;
if (attn_mask && recompute) {
data[i].x += attn_mask[data_id + mask_offset];
if ((data_id + 1) < sequence_length)
data[i].y += attn_mask[data_id + mask_offset + 1];
if ((data_id + 2) < sequence_length)
data[i].z += attn_mask[data_id + mask_offset + 2];
}
}
max_val = (data[i].x > max_val ? data[i].x : max_val);
max_val = (data[i].y > max_val ? data[i].y : max_val);
max_val = (data[i].z > max_val ? data[i].z : max_val);
max_val = (data[i].w > max_val ? data[i].w : max_val);
} else {
data[i].x = minus_infinity;
data[i].y = minus_infinity;
data[i].z = minus_infinity;
data[i].w = minus_infinity;
}
}
for (int i = 1; i < WARP_SIZE; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
if (reduceWidth > WARP_SIZE) {
if (lane == 0) partialSum[wid] = max_val;
b.sync();
if (lane < warp_num) max_val = partialSum[lane];
b.sync();
for (int i = 1; i < reduce_blocks; i *= 2) {
auto temp = g.shfl_xor(max_val, i);
max_val = (temp > max_val ? temp : max_val);
}
max_val = g.shfl(max_val, threadIdx.x / WARP_SIZE);
}
float sum = 0;
for (int i = 0; i < iterations; i++) {
data[i].x = __expf(data[i].x - max_val);
data[i].y = __expf(data[i].y - max_val);
data[i].z = __expf(data[i].z - max_val);
data[i].w = __expf(data[i].w - max_val);
sum += (data[i].x + data[i].y + data[i].z + data[i].w);
}
for (int i = 1; i < WARP_SIZE; i *= 2) sum += g.shfl_xor(sum, i);
if (reduceWidth > WARP_SIZE) {
if (lane == 0) partialSum[wid] = sum;
b.sync();
if (lane < warp_num) sum = partialSum[lane];
b.sync();
for (int i = 1; i < reduce_blocks; i *= 2) { sum += g.shfl_xor(sum, i); }
sum = g.shfl(sum, threadIdx.x / WARP_SIZE);
}
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
if (data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
vals[data_id] = data[i].x / sum;
vals[data_id + 1] = data[i].y / sum;
vals[data_id + 2] = data[i].z / sum;
vals[data_id + 3] = data[i].w / sum;
} else {
vals[data_id] = data[i].x / sum;
if ((data_id + 1) < sequence_length) vals[data_id + 1] = data[i].y / sum;
if ((data_id + 2) < sequence_length) vals[data_id + 2] = data[i].z / sum;
}
}
}
}
}
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
float scale,
cudaStream_t stream)
{
int total_count = batch_size * heads * num_seq;
dim3 grid_dim((total_count - 1) / (WARP_SIZE / ((sequence_length - 1) / ATTN_THREADS + 1)) + 1);
dim3 block_dim(ATTN_THREADS);
const int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1) * WARP_SIZE;
const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1;
if (sequence_length <= 32768)
attn_softmax_v2<<<grid_dim, block_dim, 0, stream>>>(
vals,
mask,
triangular,
recompute,
local_attention,
window_size,
total_count,
(triangular ? (heads * batch_size) : heads),
sequence_length,
num_seq,
scale,
iterations,
reduce_width);
else
throw std::runtime_error("Unsupported Seq_Length!");
}
template void launch_attn_softmax_v2(float* vals,
float* mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
float scale,
cudaStream_t stream);
template void launch_attn_softmax_v2(__half* vals,
__half* mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
float scale,
cudaStream_t stream);
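// Illustrative call (not part of the original source): vals holds the raw attention
// logits laid out as [batch, heads, num_seq, sequence_length] and is overwritten with
// softmax probabilities; sequence lengths up to 32768 are supported by the launcher.
//
//   launch_attn_softmax_v2(d_scores, d_attn_mask, /*triangular=*/true, /*recompute=*/true,
//                          /*local_attention=*/false, /*window_size=*/256,
//                          batch, heads, num_seq, seq_len,
//                          /*scale=*/1.0f, stream);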


@ -1,112 +1,112 @@
#pragma once
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
cudaError_t error_code = callstr; \
if (error_code != cudaSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
inline int DS_GET_BLOCKS(const int N)
{
return std::max(
std::min((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
// Use at least 1 block, since CUDA does not allow empty block
1);
}
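// Illustrative use of the helpers above (not part of the original source): a simple
// element-wise kernel launched with DS_GET_BLOCKS and iterated with the grid-stride
// CUDA_1D_KERNEL_LOOP macro.
//
//   __global__ void scale_kernel(float* data, float alpha, int n)
//   {
//       CUDA_1D_KERNEL_LOOP(i, n) { data[i] *= alpha; }
//   }
//   // scale_kernel<<<DS_GET_BLOCKS(n), DS_CUDA_NUM_THREADS, 0, stream>>>(d_data, 2.0f, n);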
class Context {
public:
Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
{
curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(_gen, 123);
if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
auto message = std::string("Failed to create cublas handle.");
std::cerr << message << std::endl;
throw std::runtime_error(message);
}
cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH);
}
virtual ~Context()
{
cublasDestroy(_cublasHandle);
cudaFree(_workspace);
}
static Context& Instance()
{
static Context _ctx;
return _ctx;
}
void GenWorkSpace(size_t size)
{
if (!_workspace) {
assert(_workspace == nullptr);
cudaMalloc(&_workspace, size);
} else if (_workSpaceSize < size) {
cudaFree(_workspace);
cudaMalloc(&_workspace, size);
}
_workSpaceSize = size;
}
void* GetWorkSpace() { return _workspace; }
curandGenerator_t& GetRandGenerator() { return _gen; }
cudaStream_t GetCurrentStream()
{
// get current pytorch stream.
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
return stream;
}
cublasHandle_t GetCublasHandle() { return _cublasHandle; }
std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
{
uint64_t offset = _curr_offset;
_curr_offset += offset_inc;
return std::pair<uint64_t, uint64_t>(_seed, offset);
}
void SetSeed(uint64_t new_seed) { _seed = new_seed; }
const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }
private:
curandGenerator_t _gen;
cublasHandle_t _cublasHandle;
void* _workspace;
uint64_t _seed;
uint64_t _curr_offset;
size_t _workSpaceSize;
std::vector<std::array<int, 3>> _gemm_algos;
};
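// Typical usage of the singleton above (illustrative only): the handle and stream
// accessors feed the GEMM wrappers and kernel launchers, and IncrementOffset()
// hands out (seed, offset) pairs that kernels can use for reproducible RNG.
//
//   cublasHandle_t handle = Context::Instance().GetCublasHandle();
//   cudaStream_t stream = Context::Instance().GetCurrentStream();
//   cublasSetStream(handle, stream);
//   auto seed_offset = Context::Instance().IncrementOffset(16);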


@ -1,208 +1,208 @@
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <stdio.h>
#include "cublas_wrappers.h"
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
cublasGemmAlgo_t algo)
{
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
CUDA_R_32F,
(transa == CUBLAS_OP_N) ? m : k,
(const void*)B,
CUDA_R_32F,
(transb == CUBLAS_OP_N) ? k : n,
(const void*)beta,
C,
CUDA_R_32F,
m,
CUDA_R_32F,
algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
cublasGemmAlgo_t algo)
{
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
CUDA_R_16F,
(transa == CUBLAS_OP_N) ? m : k,
(const void*)B,
CUDA_R_16F,
(transb == CUBLAS_OP_N) ? k : n,
(const void*)beta,
(void*)C,
CUDA_R_16F,
m,
CUDA_R_32F,
algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
cublasGemmAlgo_t algo)
{
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
CUDA_R_32F,
(op_A == CUBLAS_OP_N) ? m : k,
stride_A,
B,
CUDA_R_32F,
(op_B == CUBLAS_OP_N) ? k : n,
stride_B,
beta,
C,
CUDA_R_32F,
m,
stride_C,
batch,
CUDA_R_32F,
algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
batch,
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
cublasGemmAlgo_t algo)
{
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
CUDA_R_16F,
(op_A == CUBLAS_OP_N) ? m : k,
stride_A,
B,
CUDA_R_16F,
(op_B == CUBLAS_OP_N) ? k : n,
stride_B,
beta,
C,
CUDA_R_16F,
m,
stride_C,
batch,
CUDA_R_32F,
algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
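A hedged example call, not part of this diff, for the FP32 wrapper above; example_gemm_call and the assumption that A, B and C are column-major device buffers are illustrative.
// Illustrative only: C = A * B for column-major device buffers, no transposes,
// so A is (m x k) with lda = m, B is (k x n) with ldb = k, and C is (m x n) with ldc = m.
inline void example_gemm_call(cublasHandle_t handle,
                              const float* A,
                              const float* B,
                              float* C,
                              int m,
                              int n,
                              int k)
{
    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublas_gemm_ex(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, &beta, A, B, C,
                   CUBLAS_GEMM_DEFAULT);
}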

View File

@@ -1,79 +1,79 @@
#pragma once
#include <cooperative_groups.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
#define MAX_WARP_NUM 32
#define WARP_SIZE 32
#define SMs 80
#define MAX_REGISTERS 256
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
float scale,
cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream);
template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream);
template <typename T>
void launch_bias_residual(T* input,
const T* residual,
const T* bias,
int size,
int intermediate_size,
cudaStream_t stream);
template <typename T>
void launch_layer_norm(T* out,
T* vals,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream);
template <typename T>
void launch_residual_layer_norm(T* norm,
T* res_add,
T* vals,
T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
cudaStream_t stream);
template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
unsigned merge_count,
cudaStream_t stream);
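A short sketch, not part of this diff, of how one of the launchers declared above might be invoked; the wrapper name and buffer layout are assumptions.
// Illustrative only: fused bias + GeLU over an activation buffer of shape
// [batch_tokens, intermediate_size] laid out row-major on the GPU.
template <typename T>
inline void example_bias_gelu(T* activations,
                              const T* bias,
                              int intermediate_size,
                              int batch_tokens,
                              cudaStream_t stream)
{
    launch_bias_gelu<T>(activations, bias, intermediate_size, batch_tokens, stream);
}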

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,9 +1,9 @@
# Copyright 2020 The Microsoft DeepSpeed Team
PDSH_LAUNCHER = 'pdsh'
PDSH_MAX_FAN_OUT = 1024
OPENMPI_LAUNCHER = 'openmpi'
MVAPICH_LAUNCHER = 'mvapich'
MVAPICH_TMP_HOSTFILE = '/tmp/deepspeed_mvapich_hostfile'

View File

@@ -1,227 +1,227 @@
import os
import sys
import shutil
import subprocess
import warnings
from abc import ABC, abstractmethod
from ..utils import logger
from .constants import PDSH_MAX_FAN_OUT, MVAPICH_TMP_HOSTFILE
class MultiNodeRunner(ABC):
def __init__(self, args, world_info_base64):
self.args = args
self.validate_args()
self.user_arguments = self.parse_user_args()
self.user_script = args.user_script
self.world_info_base64 = world_info_base64
self.exports = {}
@abstractmethod
def backend_exists(self):
"""Return whether the corresponding backend exists"""
@abstractmethod
def get_cmd(self, environment, active_resources):
"""Return the command to execute on node"""
def add_export(self, key, var):
self.exports[key.strip()] = var.strip()
def parse_user_args(self):
return self.args.user_args
@property
def name(self):
"""Return the name of the backend"""
return self.__class__.__name__
def validate_args(self):
"""Validate self.args"""
class PDSHRunner(MultiNodeRunner):
def __init__(self, args, world_info_base64):
super().__init__(args, world_info_base64)
def backend_exists(self):
return shutil.which('pdsh')
@property
def name(self):
return "pdsh"
def parse_user_args(self):
return list(
map(lambda x: x if x.startswith("-") else f"'{x}'",
self.args.user_args))
def get_cmd(self, environment, active_resources):
environment['PDSH_RCMD_TYPE'] = 'ssh'
active_workers = ",".join(active_resources.keys())
logger.info("Running on the following workers: %s" % active_workers)
# PDSH flags for max node fan out and specific hosts to launch on
# See https://linux.die.net/man/1/pdsh for flag details
pdsh_cmd_args = ['pdsh', '-f', str(PDSH_MAX_FAN_OUT), '-w', active_workers]
exports = ""
for key, val in self.exports.items():
exports += f"export {key}={val}; "
# https://linux.die.net/man/1/pdsh
# %n will be replaced by pdsh command
deepspeed_launch = [
exports,
f"cd {os.path.abspath('.')};",
sys.executable,
"-u",
"-m",
"deepspeed.launcher.launch",
f'--world_info={self.world_info_base64}',
"--node_rank=%n",
f"--master_addr={self.args.master_addr}",
f"--master_port={self.args.master_port}"
]
return pdsh_cmd_args + deepspeed_launch + [self.user_script
] + self.user_arguments
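# Hypothetical sketch (not part of this diff): assembling the pdsh launch command for
# inspection. `args` is the launcher's argparse namespace and `active_resources` the
# {hostname: slots} mapping built elsewhere in the launcher; both names are assumptions.
def _sketch_pdsh_command(args, world_info_base64, active_resources):
    runner = PDSHRunner(args, world_info_base64)
    if not runner.backend_exists():
        raise RuntimeError("pdsh is not installed on this host")
    env = os.environ.copy()
    return runner.get_cmd(env, active_resources)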
class OpenMPIRunner(MultiNodeRunner):
def __init__(self, args, world_info_base64, resource_pool):
super().__init__(args, world_info_base64)
self.resource_pool = resource_pool
self.add_export('UCX_TLS', 'tcp')
def backend_exists(self):
        #TODO: if IB is available we should suggest mvapich
return shutil.which('ompi_info')
@property
def name(self):
return "openmpi"
def validate_args(self):
super().validate_args()
#TODO: Allow for include/exclude at node-level but not gpu-level
if self.args.include != "" or self.args.exclude != "":
raise ValueError(
f"{self.name} backend does not support worker include/exclusion")
if self.args.num_nodes != -1 or self.args.num_gpus != -1:
raise ValueError(
f"{self.name} backend does not support limiting num nodes/gpus")
def get_cmd(self, environment, active_resources):
total_process_count = sum(self.resource_pool.values())
mpirun_cmd = [
'mpirun',
'-n',
f'{total_process_count}',
'-hostfile',
f'{self.args.hostfile}',
'--mca',
'btl',
'^openib',
'--mca',
'btl_tcp_if_include',
'eth0',
]
export_cmd = []
for k, v in self.exports.items():
export_cmd += ['-x', f'{k}={v}']
python_exec = [sys.executable, "-u"]
return mpirun_cmd + export_cmd + python_exec + [self.user_script
] + self.user_arguments
class MVAPICHRunner(MultiNodeRunner):
def __init__(self, args, world_info_base64, resource_pool):
super().__init__(args, world_info_base64)
self.resource_pool = resource_pool
# Disable the CMA kernel module, not available on Ubuntu systems
self.add_export('MV2_SMP_USE_CMA', '0')
# If we fail this will output more verbose logging
self.add_export('MV2_DEBUG_SHOW_BACKTRACE', '1')
        # Enable CUDA-aware communication
self.add_export('MV2_USE_CUDA', '1')
# Support deep learning frameworks: http://hidl.cse.ohio-state.edu/userguide/horovod/
self.add_export('MV2_SUPPORT_DL', '1')
# Support MPI_THREAD_MULTIPLE
self.add_export('MV2_ENABLE_AFFINITY', '0')
# Performance tuning flags for allgather
self.add_export('MV2_INTER_ALLGATHER_TUNING', '5')
self.add_export('MV2_CUDA_USE_NAIVE', '0')
def backend_exists(self):
        #TODO: if IB is available we should suggest mvapich
mpiname_exists = shutil.which('mpiname')
exists = False
if not mpiname_exists:
warnings.warn("mpiname does not exist, mvapich is not installed properly")
else:
results = subprocess.check_output('mpiname', shell=True)
mpiname_results = results.decode('utf-8').strip()
if "MVAPICH2-GDR" in mpiname_results:
exists = True
else:
warnings.warn(
f"Expected MVAPICH2-GDR as return for mpiname but received {mpiname_results}"
)
return exists
@property
def name(self):
return "mvapich"
def validate_args(self):
super().validate_args()
#TODO: Allow for include/exclude at node-level but not gpu-level
if self.args.include != "" or self.args.exclude != "":
raise ValueError(
f"{self.name} backend does not support worker include/exclusion")
if self.args.num_nodes != -1 or self.args.num_gpus != -1:
raise ValueError(
f"{self.name} backend does not support limiting num nodes/gpus")
def get_cmd(self, environment, active_resources):
devices_per_node = self.resource_pool.values()
total_process_count = sum(devices_per_node)
process_per_node = list(devices_per_node)[0]
if not all([n == process_per_node for n in devices_per_node]):
raise ValueError("mvapich requires same number of devices per node")
with open(MVAPICH_TMP_HOSTFILE, 'w') as fd:
for host in self.resource_pool.keys():
fd.write(f'{host}\n')
mpirun_cmd = [
'mpirun',
'-np',
f'{total_process_count}',
'-ppn',
f'{process_per_node}',
'--hostfile',
f'{MVAPICH_TMP_HOSTFILE}',
]
export_cmd = []
for k, v in self.exports.items():
export_cmd += ['-env', f'{k}={v}']
python_exec = [sys.executable, "-u"]
return mpirun_cmd + export_cmd + python_exec + [self.user_script
] + self.user_arguments
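A hedged sketch, not part of this diff, of how a caller might pick among the runners defined above; the helper name _sketch_pick_runner is an assumption.
def _sketch_pick_runner(args, world_info_base64, resource_pool):
    # Try the backends in the launcher's usual order of preference and return the
    # first one whose tooling is actually installed on this host.
    for runner in (PDSHRunner(args, world_info_base64),
                   OpenMPIRunner(args, world_info_base64, resource_pool),
                   MVAPICHRunner(args, world_info_base64, resource_pool)):
        if runner.backend_exists():
            return runner
    raise RuntimeError("no supported multi-node launcher (pdsh, openmpi, mvapich) found")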

View File

@@ -1,80 +1,80 @@
import copy
import torch
import deepspeed
def quantize_transformer_layer(orig_layer_impl, model, megatron=False, preln=False):
""" Quantize bert-style transformer layers with DeepSpeed's transformer layer
Arguments:
orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for,
e.g., transformers.modeling_bert.BertLayer.
model (torch.nn.Module): user's nn.module representing their model
megatron (bool): megatron model-parallel implementation (this is supported for inference only)
preln (bool): does the original layer implementation do pre or post layer norm?
    Note: For BERT-style models, injection assumes the DeepSpeed-Examples model definitions unless the huggingface flag is set.
Returns:
Updated nn.module with quantized transformer layers
"""
def quantize_weight(weight):
return weight.to(torch.int8)
def megatron_layer_quantize(layer):
layer.attention.query_key_value.weight.data = quantize_weight(
layer.attention.query_key_value.weight.data)
layer.attention.dense.weight.data = quantize_weight(
layer.attention.dense.weight.data)
layer.mlp.dense_h_to_4h.weight.data = quantize_weight(
layer.mlp.dense_h_to_4h.weight.data)
layer.mlp.dense_4h_to_h.weight.data = quantize_weight(
layer.mlp.dense_4h_to_h.weight.data)
def bert_layer_quantize(layer):
layer.attention.self.query.weight.data = quantize_weight(
layer.attention.self.query.weight.data)
layer.attention.self.key.weight.data = quantize_weight(
layer.attention.self.key.weight.data)
layer.attention.self.value.weight.data = quantize_weight(
layer.attention.self.value.weight.data)
layer.attention.output.dense.weight.data = quantize_weight(
layer.attention.output.dense.weight.data)
if preln:
layer.intermediate.dense_act.weight.data = quantize_weight(
layer.intermediate.dense_act.weight.data)
else:
layer.intermediate.dense.weight.data = quantize_weight(
layer.intermediate.dense.weight.data)
layer.output.dense.weight.data = quantize_weight(layer.output.dense.weight.data)
def quantize_fn(child):
if megatron:
# Quantize megatron GPT2 / GPT3 trained model
megatron_layer_quantize(child)
else:
# Quantize either DeepSpeed or HuggingFace trained model
bert_layer_quantize(child)
return child
return quantize_module(model=model,
orig_class=orig_layer_impl,
quantize_fn=quantize_fn)
def quantize_module(model, orig_class, quantize_fn):
policy = {orig_class: quantize_fn}
return _quantize_module(model, policy)
def _quantize_module(model, policies):
for name, child in model.named_children():
if child.__class__ in policies:
orig = repr(child)
setattr(model, name, policies[child.__class__](child))
new = getattr(model, name)
else:
_quantize_module(child, policies)
return model
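A hedged usage sketch, not part of this diff, applying the helper above to a HuggingFace BERT model; sketch_quantize_hf_bert is a hypothetical name and the transformers import is assumed to be available.
def sketch_quantize_hf_bert(model):
    # Cast the linear weights of every transformers BertLayer in `model` to int8.
    import transformers
    return quantize_transformer_layer(transformers.models.bert.modeling_bert.BertLayer,
                                      model,
                                      megatron=False,
                                      preln=False)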

View File

@@ -1,239 +1,239 @@
from abc import ABC
import torch
class DSPolicy(ABC):
def __init__(self, inference=True, linear_layer=True, scale_attention=True):
self.inference = inference
self.linear_layer = linear_layer
self.scale_attention = scale_attention
def attention(self):
"""
Returns attention qkv and dense parameters
weight: (3*hidden, hidden) and (hidden, hidden)
bias: (3*hidden) and (hidden)
"""
raise NotImplementedError
def get_hidden_heads(self):
"""
return hidden_size and number of heads
"""
raise NotImplementedError
def mlp(self):
"""
Returns mlp intermediate and output
weight: (intermediate, hidden) and (hidden, intermediate)
bias: (intermediate) and (hidden)
"""
raise NotImplementedError
def layerNorm(self):
"""
Returns LayerNorms used in transformer layer
Post-Attention and pre/post layer norm
gamma and beta with shape: (hidden)
"""
raise NotImplementedError
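# Hypothetical sketch (not part of this diff): the minimal surface a new policy must
# cover. The client-module attribute names below (attn.qkv, attn.out, mlp.fc1, ...)
# are invented for illustration and do not correspond to a real model.
class _ExampleLayerPolicy(DSPolicy):
    _orig_layer_class = None  # would point at the framework's transformer layer class

    def __init__(self, client_module, inference=True):
        super().__init__(inference)
        self.client_module = client_module

    def get_hidden_heads(self):
        return self.client_module.attn.qkv.weight.shape[1], \
               self.client_module.attn.num_heads

    def attention(self):
        attn = self.client_module.attn
        return self.linear_layer, attn.qkv.weight.data, attn.qkv.bias.data, \
               attn.out.weight.data, attn.out.bias.data, self.scale_attention

    def mlp(self):
        mlp = self.client_module.mlp
        return self.linear_layer, mlp.fc1.weight.data, mlp.fc1.bias.data, \
               mlp.fc2.weight.data, mlp.fc2.bias.data

    def layerNorm(self):
        return self.client_module.ln2.weight.data, self.client_module.ln2.bias.data, \
               self.client_module.ln1.weight.data, self.client_module.ln1.bias.data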
class HFBertLayerPolicy(DSPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=False, preln=False):
super().__init__(inference)
self.client_module = client_module
self.preln = preln
if HFBertLayerPolicy._orig_layer_class is None:
try:
import transformers
HFBertLayerPolicy._orig_layer_class = transformers.models.bert.modeling_bert.BertLayer
            except (ImportError, AttributeError):
HFBertLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attention.self.query.weight.data.shape[1], \
self.client_module.attention.self.num_attention_heads
def attention(self):
qw = self.client_module.attention.self.query.weight.data
qb = self.client_module.attention.self.query.bias.data
kw = self.client_module.attention.self.key.weight.data
kb = self.client_module.attention.self.key.bias.data
vw = self.client_module.attention.self.value.weight.data
vb = self.client_module.attention.self.value.bias.data
qkvw = torch.cat((qw, kw, vw), dim=0)
qkvb = torch.cat((qb, kb, vb), dim=0)
return self.linear_layer, \
qkvw, \
qkvb, \
self.client_module.attention.output.dense.weight.data, \
self.client_module.attention.output.dense.bias.data, \
self.scale_attention
def mlp(self):
if self.preln:
intermediate_ff = self.client_module.intermediate.dense_act
else:
intermediate_ff = self.client_module.intermediate.dense
return self.linear_layer, intermediate_ff.weight.data, intermediate_ff.bias.data, \
self.client_module.output.dense.weight.data, \
self.client_module.output.dense.bias.data
def layerNorm(self):
if self.preln:
attention_layernorm = self.client_module.PostAttentionLayerNorm
transformer_layernorm = self.client_module.PreAttentionLayerNorm
else:
attention_layernorm = self.client_module.attention.output.LayerNorm
transformer_layernorm = self.client_module.output.LayerNorm
return attention_layernorm.weight.data, \
attention_layernorm.bias.data, \
transformer_layernorm.weight.data, \
transformer_layernorm.bias.data
class HFGPTNEOLayerPolicy(DSPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=True):
super().__init__(inference, scale_attention=False)
self.client_module = client_module
try:
import transformers
HFGPTNEOLayerPolicy._orig_layer_class = transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoBlock
        except (ImportError, AttributeError):
HFGPTNEOLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attn.attention.q_proj.weight.data.shape[1], \
self.client_module.attn.attention.num_heads
def attention(self):
qw = self.client_module.attn.attention.q_proj.weight.data
kw = self.client_module.attn.attention.k_proj.weight.data
vw = self.client_module.attn.attention.v_proj.weight.data
qkvw = torch.cat((qw, kw, vw), dim=0)
return self.linear_layer, \
qkvw, \
None, \
self.client_module.attn.attention.out_proj.weight.data, \
self.client_module.attn.attention.out_proj.bias.data, \
self.scale_attention
def mlp(self):
return self.linear_layer, \
self.client_module.mlp.c_fc.weight.data, \
self.client_module.mlp.c_fc.bias.data, \
self.client_module.mlp.c_proj.weight.data, \
self.client_module.mlp.c_proj.bias.data
def layerNorm(self):
return self.client_module.ln_2.weight.data, \
self.client_module.ln_2.bias.data, \
self.client_module.ln_1.weight.data, \
self.client_module.ln_1.bias.data
class MegatronLayerPolicy(DSPolicy):
_orig_layer_class = None
def __init__(self, client_module, version=0, inference=True):
super().__init__(inference)
self.client_module = client_module
# we use megatron version to differentiate between the old and new
# megatron-lm source code
self.version = version
if MegatronLayerPolicy._orig_layer_class is None:
try:
import megatron
from megatron.model.transformer import ParallelTransformerLayer
MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer
except ImportError:
MegatronLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attention.query_key_value.weight.data.shape[1], \
self.client_module.attention.num_attention_heads
def attention(self):
        # Resolve the attention module for both megatron-lm layouts; doing this
        # unconditionally avoids an undefined name when inference is False.
        if self.version == 0:
            attention = self.client_module.attention
        else:
            attention = self.client_module.self_attention
return self.linear_layer, \
attention.query_key_value.weight.data, \
attention.query_key_value.bias.data, \
attention.dense.weight.data, \
attention.dense.bias.data, \
self.scale_attention
def mlp(self):
return self.linear_layer, \
self.client_module.mlp.dense_h_to_4h.weight.data, \
self.client_module.mlp.dense_h_to_4h.bias.data, \
self.client_module.mlp.dense_4h_to_h.weight.data, \
self.client_module.mlp.dense_4h_to_h.bias.data
def layerNorm(self):
return self.client_module.post_attention_layernorm.weight.data, \
self.client_module.post_attention_layernorm.bias.data, \
self.client_module.input_layernorm.weight.data, \
self.client_module.input_layernorm.bias.data
class HFGPT2LayerPolicy(DSPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=True):
# HuggingFace GPT2 uses convolutional layer instead of linear layer
super().__init__(inference, linear_layer=False)
self.client_module = client_module
try:
import transformers
HFGPT2LayerPolicy._orig_layer_class = transformers.models.gpt2.modeling_gpt2.GPT2Block
except ImportError:
HFGPT2LayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attn.embed_dim, \
self.client_module.attn.num_heads
def attention(self):
return self.linear_layer, \
self.client_module.attn.c_attn.weight.data, \
self.client_module.attn.c_attn.bias.data, \
self.client_module.attn.c_proj.weight.data, \
self.client_module.attn.c_proj.bias.data, \
self.scale_attention
def mlp(self):
return self.linear_layer, \
self.client_module.mlp.c_fc.weight.data, \
self.client_module.mlp.c_fc.bias.data, \
self.client_module.mlp.c_proj.weight.data, \
self.client_module.mlp.c_proj.bias.data
def layerNorm(self):
return self.client_module.ln_2.weight.data, \
self.client_module.ln_2.bias.data, \
self.client_module.ln_1.weight.data, \
self.client_module.ln_1.bias.data
replace_policies = [
HFBertLayerPolicy,
HFGPTNEOLayerPolicy,
MegatronLayerPolicy,
HFGPT2LayerPolicy,
]
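A hedged sketch, not part of this diff, of how a submodule might be matched against the policies registered above; _sketch_find_policy is a hypothetical helper.
def _sketch_find_policy(module):
    # Return an instantiated policy for `module`, or None if no registered policy
    # recognizes its class (e.g. the corresponding framework is not installed).
    for policy_cls in replace_policies:
        orig_class = policy_cls._orig_layer_class
        if orig_class is not None and isinstance(module, orig_class):
            return policy_cls(module, inference=True)
    return None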

View File

@@ -1,135 +1,135 @@
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import math
import torch
import time
from pathlib import Path
from ..op_builder import CPUAdagradBuilder
from deepspeed.utils.logging import should_log_le
class DeepSpeedCPUAdagrad(torch.optim.Optimizer):
optimizer_id = 0
def __init__(self,
model_params,
lr=1e-2,
eps=1e-10,
weight_decay=0,
amsgrad=False,
fp32_optimizer_states=True):
default_args = dict(lr=lr, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad)
super(DeepSpeedCPUAdagrad, self).__init__(model_params, default_args)
self.opt_id = DeepSpeedCPUAdagrad.optimizer_id
DeepSpeedCPUAdagrad.optimizer_id = DeepSpeedCPUAdagrad.optimizer_id + 1
self.fp32_optimizer_states = fp32_optimizer_states
self.ds_opt_adagrad = CPUAdagradBuilder().load()
self.ds_opt_adagrad.create_adagrad(self.opt_id,
lr,
eps,
weight_decay,
should_log_le("info"))
def __del__(self):
# need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize
# is used multiple times in the same process (notebook or pytest worker)
self.ds_opt_adagrad.destroy_adagrad(self.opt_id)
def __setstate__(self, state):
super(DeepSpeedCPUAdagrad, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('amsgrad', False)
@torch.no_grad()
def step(self, closure=None, fp16_param_groups=None):
"""Update the model parameters.
.. note::
This method will be called internally by ZeRO-Offload. DeepSpeed
users should still use ``engine.step()`` as shown in the
`Getting Started
<https://www.deepspeed.ai/getting-started/#training>`_ guide.
Args:
closure (callable, optional): closure to compute the loss.
Defaults to ``None``.
fp16_param_groups: FP16 GPU parameters to update. Performing the
copy here reduces communication time. Defaults to ``None``.
Returns:
loss: if ``closure`` is provided. Otherwise ``None``.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
for group_id, group in enumerate(self.param_groups):
for param_id, p in enumerate(group['params']):
if p.grad is None:
continue
state = self.state[p]
# State initialization
if len(state) == 0:
#print(f'group {group_id} param {param_id} = {p.numel()}')
state['step'] = 0
#use full precision by default unless self.fp32_optimizer_states is off
state_dtype = torch.float if self.fp32_optimizer_states else p.dtype
#memory_format=torch.preserve_format)
# gradient variances
state['exp_avg_sq'] = torch.zeros_like(p.data,
dtype=state_dtype,
device='cpu')
#memory_format=torch.preserve_format)
state['step'] += 1
                if p.grad.is_sparse:
sparse_param = p.sparse_mask(p.grad)
sparse_exp_avg_sq = state['exp_avg_sq'].sparse_mask(p.grad)
self.ds_opt_adagrad.adagrad_update(self.opt_id,
state['step'],
group['lr'],
group['eps'],
group['weight_decay'],
sparse_param.values(),
p.grad.values(),
sparse_exp_avg_sq.values())
p[sparse_param.indices()] = sparse_param.values()
state['exp_avg_sq'][
sparse_exp_avg_sq.indices()] = sparse_exp_avg_sq.values()
if fp16_param_groups is not None:
fp16_param_groups[group_id][param_id][
sparse_param.indices()] = sparse_param.values()
else:
if fp16_param_groups is not None:
self.ds_opt_adagrad.adagrad_update_copy(
self.opt_id,
state['step'],
group['lr'],
group['eps'],
group['weight_decay'],
p.data,
p.grad.data,
state['exp_avg_sq'],
fp16_param_groups[group_id][param_id].data)
else:
self.ds_opt_adagrad.adagrad_update(self.opt_id,
state['step'],
group['lr'],
group['eps'],
group['weight_decay'],
p.data,
p.grad.data,
state['exp_avg_sq'])
return loss
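A hedged usage sketch, not part of this diff, exercising the optimizer above on a tiny CPU model; it assumes the cpu_adagrad extension builds on the local machine.
def sketch_adagrad_step():
    # FP32 master weights stay on the CPU, as the optimizer expects.
    model = torch.nn.Linear(16, 4)
    opt = DeepSpeedCPUAdagrad(model.parameters(), lr=1e-2, weight_decay=0)
    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    opt.step()  # updates the exp_avg_sq state in place via the C++ extension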

View File

@@ -1,2 +1,2 @@
from .cpu_adam import DeepSpeedCPUAdam
from .fused_adam import FusedAdam

View File

@@ -1,186 +1,186 @@
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import math
import torch
import time
from pathlib import Path
from ..op_builder import CPUAdamBuilder
from deepspeed.utils.logging import should_log_le
class DeepSpeedCPUAdam(torch.optim.Optimizer):
optimizer_id = 0
def __init__(self,
model_params,
lr=1e-3,
bias_correction=True,
betas=(0.9,
0.999),
eps=1e-8,
weight_decay=0,
amsgrad=False,
adamw_mode=True,
fp32_optimizer_states=True):
"""Fast vectorized implementation of two variations of Adam optimizer on CPU:
* Adam: A Method for Stochastic Optimization: (https://arxiv.org/abs/1412.6980);
* AdamW: Fixing Weight Decay Regularization in Adam (https://arxiv.org/abs/1711.05101)
        DeepSpeed CPU Adam(W) provides between 5x and 7x speedup over torch.optim.Adam(W).
        In order to apply this optimizer, the model's master parameters (in FP32) must
        reside in CPU memory.
        To train on a heterogeneous system, such as coordinating CPU and GPU, DeepSpeed offers
        the ZeRO-Offload technology which efficiently offloads the optimizer states into CPU memory,
        with minimal impact on training throughput. DeepSpeedCPUAdam plays an important role in minimizing
        the optimizer's latency overhead on the CPU. Please refer to the ZeRO-Offload tutorial
        (https://www.deepspeed.ai/tutorials/zero-offload/) for more information on how to enable this technology.
        When calling the step function, there are two options: (1) update the optimizer's states, or (2) update
        the optimizer's states and copy the parameters back to the GPU at the same time. We have seen that the
        second option can bring 30% higher throughput than doing the copy separately as in option one.
.. note::
We recommend using our `config
<https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`_
to allow :meth:`deepspeed.initialize` to build this optimizer
for you.
Arguments:
model_params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in DeepSpeed CPUAdam!
adamw_mode: select between Adam and AdamW implementations (default: AdamW)
            fp32_optimizer_states: creates momentum and variance in full precision regardless of
                the precision of the parameters (default: True)
"""
default_args = dict(lr=lr,
betas=betas,
eps=eps,
weight_decay=weight_decay,
bias_correction=bias_correction,
amsgrad=amsgrad)
super(DeepSpeedCPUAdam, self).__init__(model_params, default_args)
self.opt_id = DeepSpeedCPUAdam.optimizer_id
DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1
self.adam_w_mode = adamw_mode
self.fp32_optimizer_states = fp32_optimizer_states
self.ds_opt_adam = CPUAdamBuilder().load()
self.ds_opt_adam.create_adam(self.opt_id,
lr,
betas[0],
betas[1],
eps,
weight_decay,
adamw_mode,
should_log_le("info"))
def __del__(self):
# need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize
# is used multiple times in the same process (notebook or pytest worker)
self.ds_opt_adam.destroy_adam(self.opt_id)
def __setstate__(self, state):
super(DeepSpeedCPUAdam, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('amsgrad', False)
@torch.no_grad()
def step(self, closure=None, fp16_param_groups=None):
"""Update the model parameters.
.. note::
This method will be called internally by ZeRO-Offload. DeepSpeed
users should still use ``engine.step()`` as shown in the
`Getting Started
<https://www.deepspeed.ai/getting-started/#training>`_ guide.
Args:
closure (callable, optional): closure to compute the loss.
Defaults to ``None``.
fp16_param_groups: FP16 GPU parameters to update. Performing the
copy here reduces communication time. Defaults to ``None``.
Returns:
loss: if ``closure`` is provided. Otherwise ``None``.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
for group_id, group in enumerate(self.param_groups):
for param_id, p in enumerate(group['params']):
if p.grad is None:
continue
state = self.state[p]
# State initialization
if len(state) == 0:
#print(f'group {group_id} param {param_id} = {p.numel()}')
state['step'] = 0
#use full precision by default unless self.fp32_optimizer_states is off
state_dtype = torch.float if self.fp32_optimizer_states else p.dtype
# gradient momentums
state['exp_avg'] = torch.zeros_like(p.data,
dtype=state_dtype,
device='cpu')
#memory_format=torch.preserve_format)
# gradient variances
state['exp_avg_sq'] = torch.zeros_like(p.data,
dtype=state_dtype,
device='cpu')
#memory_format=torch.preserve_format)
state['step'] += 1
beta1, beta2 = group['betas']
if fp16_param_groups is not None:
self.ds_opt_adam.adam_update_copy(
self.opt_id,
state['step'],
group['lr'],
beta1,
beta2,
group['eps'],
group['weight_decay'],
group['bias_correction'],
p.data,
p.grad.data,
state['exp_avg'],
state['exp_avg_sq'],
fp16_param_groups[group_id][param_id].data)
else:
self.ds_opt_adam.adam_update(self.opt_id,
state['step'],
group['lr'],
beta1,
beta2,
group['eps'],
group['weight_decay'],
group['bias_correction'],
p.data,
p.grad.data,
state['exp_avg'],
state['exp_avg_sq'])
return loss
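A hedged usage sketch, not part of this diff, of plain PyTorch-style use of the optimizer above in AdamW mode; it assumes the cpu_adam extension builds on the local machine.
def sketch_adamw_step():
    # FP32 master weights stay on the CPU, as the optimizer expects.
    model = torch.nn.Linear(16, 4)
    opt = DeepSpeedCPUAdam(model.parameters(), lr=1e-3, weight_decay=0.01, adamw_mode=True)
    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    opt.step()  # fused CPU Adam(W) update via the C++ extension
    opt.zero_grad()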
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import math
import torch
import time
from pathlib import Path
from ..op_builder import CPUAdamBuilder
from deepspeed.utils.logging import should_log_le
class DeepSpeedCPUAdam(torch.optim.Optimizer):
optimizer_id = 0
def __init__(self,
model_params,
lr=1e-3,
bias_correction=True,
betas=(0.9,
0.999),
eps=1e-8,
weight_decay=0,
amsgrad=False,
adamw_mode=True,
fp32_optimizer_states=True):
"""Fast vectorized implementation of two variations of Adam optimizer on CPU:
* Adam: A Method for Stochastic Optimization: (https://arxiv.org/abs/1412.6980);
* AdamW: Fixing Weight Decay Regularization in Adam (https://arxiv.org/abs/1711.05101)
DeepSpeed CPU Adam(W) provides between 5x to 7x speedup over torch.optim.adam(W).
In order to apply this optimizer, the model requires to have its master parameter (in FP32)
reside on the CPU memory.
To train on a heterogeneous system, such as coordinating CPU and GPU, DeepSpeed offers
the ZeRO-Offload technology which efficiently offloads the optimizer states into CPU memory,
with minimal impact on training throughput. DeepSpeedCPUAdam plays an important role to minimize
the overhead of the optimizer's latency on CPU. Please refer to ZeRO-Offload tutorial
(https://www.deepspeed.ai/tutorials/zero-offload/) for more information on how to enable this technology.
For calling step function, there are two options available: (1) update optimizer's states and (2) update
optimizer's states and copy the parameters back to GPU at the same time. We have seen that the second
option can bring 30% higher throughput than the doing the copy separately using option one.
.. note::
We recommend using our `config
<https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`_
to allow :meth:`deepspeed.initialize` to build this optimizer
for you.
Arguments:
model_params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in DeepSpeed CPUAdam!
adamw_mode: select between Adam and AdamW implementations (default: AdamW)
full_precision_optimizer_states: creates momementum and variance in full precision regardless of
the precision of the parameters (default: True)
"""
default_args = dict(lr=lr,
betas=betas,
eps=eps,
weight_decay=weight_decay,
bias_correction=bias_correction,
amsgrad=amsgrad)
super(DeepSpeedCPUAdam, self).__init__(model_params, default_args)
self.opt_id = DeepSpeedCPUAdam.optimizer_id
DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1
self.adam_w_mode = adamw_mode
self.fp32_optimizer_states = fp32_optimizer_states
self.ds_opt_adam = CPUAdamBuilder().load()
self.ds_opt_adam.create_adam(self.opt_id,
lr,
betas[0],
betas[1],
eps,
weight_decay,
adamw_mode,
should_log_le("info"))
def __del__(self):
# need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize
# is used multiple times in the same process (notebook or pytest worker)
self.ds_opt_adam.destroy_adam(self.opt_id)
def __setstate__(self, state):
super(DeepSpeedCPUAdam, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('amsgrad', False)
@torch.no_grad()
def step(self, closure=None, fp16_param_groups=None):
"""Update the model parameters.
.. note::
This method will be called internally by ZeRO-Offload. DeepSpeed
users should still use ``engine.step()`` as shown in the
`Getting Started
<https://www.deepspeed.ai/getting-started/#training>`_ guide.
Args:
closure (callable, optional): closure to compute the loss.
Defaults to ``None``.
fp16_param_groups: FP16 GPU parameters to update. Performing the
copy here reduces communication time. Defaults to ``None``.
Returns:
loss: if ``closure`` is provided. Otherwise ``None``.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
for group_id, group in enumerate(self.param_groups):
for param_id, p in enumerate(group['params']):
if p.grad is None:
continue
state = self.state[p]
# State initialization
if len(state) == 0:
#print(f'group {group_id} param {param_id} = {p.numel()}')
state['step'] = 0
#use full precision by default unless self.fp32_optimizer_states is off
state_dtype = torch.float if self.fp32_optimizer_states else p.dtype
# gradient momentums
state['exp_avg'] = torch.zeros_like(p.data,
dtype=state_dtype,
device='cpu')
#memory_format=torch.preserve_format)
# gradient variances
state['exp_avg_sq'] = torch.zeros_like(p.data,
dtype=state_dtype,
device='cpu')
#memory_format=torch.preserve_format)
state['step'] += 1
beta1, beta2 = group['betas']
if fp16_param_groups is not None:
self.ds_opt_adam.adam_update_copy(
self.opt_id,
state['step'],
group['lr'],
beta1,
beta2,
group['eps'],
group['weight_decay'],
group['bias_correction'],
p.data,
p.grad.data,
state['exp_avg'],
state['exp_avg_sq'],
fp16_param_groups[group_id][param_id].data)
else:
self.ds_opt_adam.adam_update(self.opt_id,
state['step'],
group['lr'],
beta1,
beta2,
group['eps'],
group['weight_decay'],
group['bias_correction'],
p.data,
p.grad.data,
state['exp_avg'],
state['exp_avg_sq'])
return loss
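For orientation, here is a minimal usage sketch of this optimizer outside of ZeRO-Offload. It assumes the standard import path and that the CPUAdam C++ op can be JIT-built on the machine; the toy model and hyperparameter values are illustrative only.
```
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

# Toy CPU model; under ZeRO-Offload the parameter and gradient copies that
# this optimizer updates live in CPU memory.
model = torch.nn.Linear(8, 2)
optimizer = DeepSpeedCPUAdam(model.parameters(), lr=1e-3, weight_decay=0.01)

x = torch.randn(4, 8)
loss = model(x).sum()
loss.backward()

optimizer.step()       # vectorized C++ Adam/AdamW update on the CPU tensors
optimizer.zero_grad()
```
When driven by the DeepSpeed engine, ``step()`` is instead called internally with ``fp16_param_groups`` so that the updated values are copied back into the GPU-side FP16 parameters in the same pass.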

View File

@ -1,6 +1,6 @@
'''
Copyright 2020 The Microsoft DeepSpeed Team.
Licensed under the MIT license.
'''
from ..op_builder import AsyncIOBuilder

View File

@ -1,78 +1,78 @@
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
from torch import nn
from deepspeed.ops.sparse_attention import SparseSelfAttention, FixedSparsityConfig
class BertSparseSelfAttention(nn.Module):
"""Implements Sparse Self Attention layer of Bert model based on https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py#L373
For more information please see, TODO DeepSpeed Sparse Transformer.
For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial.
"""
def __init__(
self,
config,
# SparsityConfig parameters need to be set accordingly
sparsity_config=FixedSparsityConfig(num_heads=4)):
"""Initialize the bert sparse self attention layer.
Note) you can use any of the provided sparsity configs or simply add yours!
Arguments:
config: required: Bert model config
sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on FixedSparsityConfig class.
"""
super(BertSparseSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size,
config.num_attention_heads))
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.sparse_self_attention = SparseSelfAttention(sparsity_config)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads,
self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(self, hidden_states, attention_mask):
"""Applies forward phase of bert sparse self attention
Arguments:
hidden_states: required: hidden_states tensor of the bert model
attention_mask: required: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported
Return:
context_layer: a dense tensor containing attention context
"""
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
context_layer = self.sparse_self_attention(query_layer,
key_layer,
value_layer,
key_padding_mask=attention_mask)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size, )
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer
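As a rough usage sketch of the layer above, the snippet below assumes a CUDA device with the Triton-based sparse kernels available, FP16 activations, a hypothetical BERT-style config object, and an all-zeros additive padding mask; none of these specifics come from the code itself.
```
import types
import torch
from deepspeed.ops.sparse_attention import FixedSparsityConfig
# BertSparseSelfAttention is the class defined above.

# Hypothetical BERT-like config; only the two fields read above are needed.
config = types.SimpleNamespace(hidden_size=256, num_attention_heads=4)

layer = BertSparseSelfAttention(
    config, sparsity_config=FixedSparsityConfig(num_heads=4, block=16))
layer = layer.cuda().half()

batch, seq_len = 2, 128   # sequence length assumed to be a multiple of the block size
hidden_states = torch.randn(batch, seq_len, 256, device='cuda', dtype=torch.half)
attention_mask = torch.zeros(batch, seq_len, device='cuda', dtype=torch.half)  # no padding

context = layer(hidden_states, attention_mask)   # (batch, seq_len, hidden_size)
```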
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
from torch import nn
from deepspeed.ops.sparse_attention import SparseSelfAttention, FixedSparsityConfig
class BertSparseSelfAttention(nn.Module):
"""Implements Sparse Self Attention layer of Bert model based on https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py#L373
For more information please see, TODO DeepSpeed Sparse Transformer.
For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial.
"""
def __init__(
self,
config,
# SparsityConfig parameters needs to be set accordingly
sparsity_config=FixedSparsityConfig(num_heads=4)):
"""Initialize the bert sparse self attention layer.
Note) you can use any of the provided sparsity configs or simply add yours!
Arguments:
config: required: Bert model config
sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on FixedSparsityConfig class.
"""
super(BertSparseSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size,
config.num_attention_heads))
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.sparse_self_attention = SparseSelfAttention(sparsity_config)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads,
self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(self, hidden_states, attention_mask):
"""Applies forward phase of bert sparse self attention
Arguments:
hidden_states: required: hidden_states tensor of the bert model
attn_mask: required: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported
Return:
context_layer: a dense tensor containing attention context
"""
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
context_layer = self.sparse_self_attention(query_layer,
key_layer,
value_layer,
key_padding_mask=attention_mask)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size, )
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer

File diff suppressed because it is too large

View File

@ -1,103 +1,103 @@
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject
#########################################
# DeepSpeed Activation Checkpointing
#########################################
# Activation checkpointing allows saving memory by only keeping a select few
# activations for the backpropagation.
ACTIVATION_CHKPT_FORMAT = '''
Activation Checkpointing should be configured as:
"session_params": {
"activation_checkpointing": {
"partitioned_activations": [true|false],
"number_checkpoints": 100,
"contiguous_memory_optimization": [true|false],
"cpu_checkpointing": [true|false]
"profile": [true|false],
"synchronize_checkpoint_boundary": [true|false],
}
}
'''
ACT_CHKPT_PARTITION_ACTIVATIONS = 'partition_activations'
ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT = False
ACT_CHKPT_NUMBER_CHECKPOINTS = 'number_checkpoints'
ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT = None
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION = 'contiguous_memory_optimization'
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT = False
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY = 'synchronize_checkpoint_boundary'
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT = False
ACT_CHKPT_PROFILE = 'profile'
ACT_CHKPT_PROFILE_DEFAULT = False
ACT_CHKPT_CPU_CHECKPOINTING = 'cpu_checkpointing'
ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT = False
ACT_CHKPT = 'activation_checkpointing'
ACT_CHKPT_DEFAULT = {
ACT_CHKPT_PARTITION_ACTIVATIONS: ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT,
ACT_CHKPT_NUMBER_CHECKPOINTS: ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT,
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION:
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT,
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY:
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT,
ACT_CHKPT_PROFILE: ACT_CHKPT_PROFILE_DEFAULT,
ACT_CHKPT_CPU_CHECKPOINTING: ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT
}
class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject):
def __init__(self, param_dict):
super(DeepSpeedActivationCheckpointingConfig, self).__init__()
self.partition_activations = None
self.contiguous_memory_optimization = None
self.cpu_checkpointing = None
self.number_checkpoints = None
self.synchronize_checkpoint_boundary = None
self.profile = None
if ACT_CHKPT in param_dict.keys():
act_chkpt_config_dict = param_dict[ACT_CHKPT]
else:
act_chkpt_config_dict = ACT_CHKPT_DEFAULT
self._initialize(act_chkpt_config_dict)
def _initialize(self, act_chkpt_config_dict):
self.partition_activations = get_scalar_param(
act_chkpt_config_dict,
ACT_CHKPT_PARTITION_ACTIVATIONS,
ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT)
self.contiguous_memory_optimization = get_scalar_param(
act_chkpt_config_dict,
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION,
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT)
self.cpu_checkpointing = get_scalar_param(act_chkpt_config_dict,
ACT_CHKPT_CPU_CHECKPOINTING,
ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT)
self.number_checkpoints = get_scalar_param(act_chkpt_config_dict,
ACT_CHKPT_NUMBER_CHECKPOINTS,
ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT)
self.profile = get_scalar_param(act_chkpt_config_dict,
ACT_CHKPT_PROFILE,
ACT_CHKPT_PROFILE_DEFAULT)
self.synchronize_checkpoint_boundary = get_scalar_param(
act_chkpt_config_dict,
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY,
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT)
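A minimal sketch of how this configuration object is consumed; the import path is inferred from the repository layout and the sample values are illustrative, not defaults.
```
# Import path assumed from the repository layout of this file.
from deepspeed.runtime.activation_checkpointing.config import (
    DeepSpeedActivationCheckpointingConfig)

ds_config = {
    "activation_checkpointing": {
        "partition_activations": True,
        "cpu_checkpointing": True,
        "number_checkpoints": 4
    }
}

act_cfg = DeepSpeedActivationCheckpointingConfig(ds_config)
print(act_cfg.partition_activations)   # True  (from the dict above)
print(act_cfg.profile)                 # False (falls back to ACT_CHKPT_PROFILE_DEFAULT)
print(act_cfg)                         # JSON dump via DeepSpeedConfigObject.__repr__
```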
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject
#########################################
# DeepSpeed Activation Checkpointing
#########################################
# Activation Checkpointing Allows to save memory by only keeping a select few
#activations for the backpropagation.
ACTIVATION_CHKPT_FORMAT = '''
Activation Checkpointing should be configured as:
"session_params": {
"activation_checkpointing": {
"partitioned_activations": [true|false],
"number_checkpoints": 100,
"contiguous_memory_optimization": [true|false],
"cpu_checkpointing": [true|false]
"profile": [true|false],
"synchronize_checkpoint_boundary": [true|false],
}
}
'''
ACT_CHKPT_PARTITION_ACTIVATIONS = 'partition_activations'
ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT = False
ACT_CHKPT_NUMBER_CHECKPOINTS = 'number_checkpoints'
ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT = None
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION = 'contiguous_memory_optimization'
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT = False
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY = 'synchronize_checkpoint_boundary'
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT = False
ACT_CHKPT_PROFILE = 'profile'
ACT_CHKPT_PROFILE_DEFAULT = False
ACT_CHKPT_CPU_CHECKPOINTING = 'cpu_checkpointing'
ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT = False
ACT_CHKPT = 'activation_checkpointing'
ACT_CHKPT_DEFAULT = {
ACT_CHKPT_PARTITION_ACTIVATIONS: ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT,
ACT_CHKPT_NUMBER_CHECKPOINTS: ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT,
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION:
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT,
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY:
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT,
ACT_CHKPT_PROFILE: ACT_CHKPT_PROFILE_DEFAULT,
ACT_CHKPT_CPU_CHECKPOINTING: ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT
}
class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject):
def __init__(self, param_dict):
super(DeepSpeedActivationCheckpointingConfig, self).__init__()
self.partition_activations = None
self.contiguous_memory_optimization = None
self.cpu_checkpointing = None
self.number_checkpoints = None
self.synchronize_checkpoint_boundary = None
self.profile = None
if ACT_CHKPT in param_dict.keys():
act_chkpt_config_dict = param_dict[ACT_CHKPT]
else:
act_chkpt_config_dict = ACT_CHKPT_DEFAULT
self._initialize(act_chkpt_config_dict)
def _initialize(self, act_chkpt_config_dict):
self.partition_activations = get_scalar_param(
act_chkpt_config_dict,
ACT_CHKPT_PARTITION_ACTIVATIONS,
ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT)
self.contiguous_memory_optimization = get_scalar_param(
act_chkpt_config_dict,
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION,
ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT)
self.cpu_checkpointing = get_scalar_param(act_chkpt_config_dict,
ACT_CHKPT_CPU_CHECKPOINTING,
ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT)
self.number_checkpoints = get_scalar_param(act_chkpt_config_dict,
ACT_CHKPT_NUMBER_CHECKPOINTS,
ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT)
self.profile = get_scalar_param(act_chkpt_config_dict,
ACT_CHKPT_PROFILE,
ACT_CHKPT_PROFILE_DEFAULT)
self.synchronize_checkpoint_boundary = get_scalar_param(
act_chkpt_config_dict,
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY,
ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT)

View File

@ -1,80 +1,80 @@
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
"""
Collection of DeepSpeed configuration utilities
"""
import json
import collections
# adapted from https://stackoverflow.com/a/50701137/9201239
class ScientificNotationEncoder(json.JSONEncoder):
"""
This class overrides ``json.dumps`` default formatter.
This version keeps everything as normal except formats numbers bigger than 1e3 using scientific notation.
Just pass ``cls=ScientificNotationEncoder`` to ``json.dumps`` to activate it
"""
def iterencode(self, o, _one_shot=False, level=0):
indent = self.indent if self.indent is not None else 4
prefix_close = " " * level * indent
level += 1
prefix = " " * level * indent
if isinstance(o, bool):
return "true" if o else "false"
elif isinstance(o, float) or isinstance(o, int):
if o > 1e3:
return f"{o:e}"
else:
return f"{o}"
elif isinstance(o, collections.Mapping):
x = [
f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k,
v in o.items()
]
return "{" + ', '.join(x) + f"\n{prefix_close}" + "}"
elif isinstance(o, collections.Sequence) and not isinstance(o, str):
return f"[{ f', '.join(map(self.iterencode, o)) }]"
return "\n, ".join(super().iterencode(o, _one_shot))
class DeepSpeedConfigObject(object):
"""
For json serialization
"""
def repr(self):
return self.__dict__
def __repr__(self):
return json.dumps(
self.__dict__,
sort_keys=True,
indent=4,
cls=ScientificNotationEncoder,
)
def get_scalar_param(param_dict, param_name, param_default_value):
return param_dict.get(param_name, param_default_value)
def get_list_param(param_dict, param_name, param_default_value):
return param_dict.get(param_name, param_default_value)
def get_dict_param(param_dict, param_name, param_default_value):
return param_dict.get(param_name, param_default_value)
def dict_raise_error_on_duplicate_keys(ordered_pairs):
"""Reject duplicate keys."""
d = dict((k, v) for k, v in ordered_pairs)
if len(d) != len(ordered_pairs):
counter = collections.Counter([pair[0] for pair in ordered_pairs])
keys = [key for key, value in counter.items() if value > 1]
raise ValueError("Duplicate keys in DeepSpeed config: {}".format(keys))
return d
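A short sketch of these helpers in action; note that the ``collections.Mapping`` check in the encoder above assumes a Python version earlier than 3.10 (later versions moved it to ``collections.abc``).
```
import json
from deepspeed.runtime.config_utils import ScientificNotationEncoder, get_scalar_param

cfg = {"train_batch_size": 32768, "lr": 0.00015, "fp16": {"enabled": True}}

# Numbers larger than 1e3 are rendered in scientific notation, e.g. 3.276800e+04.
print(json.dumps(cfg, indent=4, cls=ScientificNotationEncoder))

# get_scalar_param is a thin wrapper around dict.get with a default value.
print(get_scalar_param(cfg, "gradient_clipping", 1.0))   # -> 1.0
```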
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
"""
Collection of DeepSpeed configuration utilities
"""
import json
import collections
# adapted from https://stackoverflow.com/a/50701137/9201239
class ScientificNotationEncoder(json.JSONEncoder):
"""
This class overrides ``json.dumps`` default formatter.
This version keeps everything as normal except formats numbers bigger than 1e3 using scientific notation.
Just pass ``cls=ScientificNotationEncoder`` to ``json.dumps`` to activate it
"""
def iterencode(self, o, _one_shot=False, level=0):
indent = self.indent if self.indent is not None else 4
prefix_close = " " * level * indent
level += 1
prefix = " " * level * indent
if isinstance(o, bool):
return "true" if o else "false"
elif isinstance(o, float) or isinstance(o, int):
if o > 1e3:
return f"{o:e}"
else:
return f"{o}"
elif isinstance(o, collections.Mapping):
x = [
f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k,
v in o.items()
]
return "{" + ', '.join(x) + f"\n{prefix_close}" + "}"
elif isinstance(o, collections.Sequence) and not isinstance(o, str):
return f"[{ f', '.join(map(self.iterencode, o)) }]"
return "\n, ".join(super().iterencode(o, _one_shot))
class DeepSpeedConfigObject(object):
"""
For json serialization
"""
def repr(self):
return self.__dict__
def __repr__(self):
return json.dumps(
self.__dict__,
sort_keys=True,
indent=4,
cls=ScientificNotationEncoder,
)
def get_scalar_param(param_dict, param_name, param_default_value):
return param_dict.get(param_name, param_default_value)
def get_list_param(param_dict, param_name, param_default_value):
return param_dict.get(param_name, param_default_value)
def get_dict_param(param_dict, param_name, param_default_value):
return param_dict.get(param_name, param_default_value)
def dict_raise_error_on_duplicate_keys(ordered_pairs):
"""Reject duplicate keys."""
d = dict((k, v) for k, v in ordered_pairs)
if len(d) != len(ordered_pairs):
counter = collections.Counter([pair[0] for pair in ordered_pairs])
keys = [key for key, value in counter.items() if value > 1]
raise ValueError("Duplicate keys in DeepSpeed config: {}".format(keys))
return d

View File

@ -1,152 +1,152 @@
import torch
from deepspeed.utils import log_dist
import numpy as np
import logging
class Eigenvalue(object):
def __init__(self,
verbose=False,
max_iter=100,
tol=1e-2,
stability=0,
gas_boundary_resolution=1,
layer_name='',
layer_num=0):
super().__init__()
self.verbose = verbose
self.max_iter = max_iter
self.tol = tol
self.stability = stability
self.gas_boundary_resolution = gas_boundary_resolution
self.layer_name = layer_name
self.layer_num = layer_num
assert len(self.layer_name) > 0 and layer_num > 0
log_dist(
f'enabled eigenvalue with verbose={verbose}, max_iter={max_iter}, tol={tol}, stability={stability}, gas_boundary_resolution={gas_boundary_resolution}, layer_name={layer_name}, layer_num={layer_num}',
ranks=[0])
# Replace all nan/pos-inf/neg-inf values with zero
# TODO: newer PyTorch versions provide torch.nan_to_num; switch to it once available.
def nan_to_num(self, x):
device = x.device
x = x.cpu().numpy()
x = np.nan_to_num(x=x, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
return torch.from_numpy(x).to(device)
def normalize(self, v):
norm_squared = self.inner_product(v, v)
norm = norm_squared**0.5 + self.stability
normalized_vectors = [vector / norm for vector in v]
normalized_vectors = [self.nan_to_num(vector) for vector in normalized_vectors]
return normalized_vectors
def inner_product(self, xs, ys):
return sum([torch.sum(x * y) for (x, y) in zip(xs, ys)])
def get_layers(self, module):
scope_names = self.layer_name.split('.')
assert len(scope_names) > 0
m = module
for name in scope_names:
assert hasattr(m, name), "layer_name configuration is invalid."
m = getattr(m, name)
return m
def compute_eigenvalue(self, module, device=None, scale=1.0):
block_eigenvalue = []
param_keys = []
layers = self.get_layers(module)
for block in range(self.layer_num):
model_block = layers[block]
# We found that this randn() call has a noticeable accuracy impact in some cases, so save/restore the random state around it.
rng_state = torch.random.get_rng_state()
if device is None:
v = [
torch.randn(p.size()) for p in model_block.parameters()
if p.grad is not None and p.grad.grad_fn is not None
]
else:
v = [
torch.randn(p.size(),
device=device) for p in model_block.parameters()
if p.grad is not None and p.grad.grad_fn is not None
]
torch.random.set_rng_state(rng_state)
grads = [
param.grad for param in model_block.parameters()
if param.grad is not None and param.grad.grad_fn is not None
]
params = [
param for param in model_block.parameters()
if param.grad is not None and param.grad.grad_fn is not None
]
layer_keys = [id(p) for p in model_block.parameters()]
param_keys.append(layer_keys)
v = self.normalize(v)
# Disable eigenvalue if the model doesn't support second order gradients computation,
# e.g. when enabling DS transformer kernel.
if len(grads) == 0 or len(params) == 0:
log_dist(f'The model does NOT support eigenvalue computation.',
ranks=[0],
level=logging.WARNING)
return []
i = 0
eigenvalue_current, eigenvalue_previous = 1., 0.
while (i < self.max_iter) and abs(eigenvalue_current) > 0 and (abs(
(eigenvalue_current - eigenvalue_previous) /
eigenvalue_current) >= self.tol): # test convergence criteria
eigenvalue_previous = eigenvalue_current
Hv = torch.autograd.grad(grads,
params,
grad_outputs=v,
only_inputs=True,
retain_graph=True)
#Hv = [hv.float() for hv in Hv]
Hv = [self.nan_to_num(hv).float() for hv in Hv]
eigenvalue_current = self.inner_product(Hv, v).item()
v = self.normalize(Hv)
v = [x / scale for x in v]
i += 1
eigenvalue_current *= scale
block_eigenvalue.append(eigenvalue_current)
if self.verbose:
log_dist(
f'block: {block}, power iteration: {i}, eigenvalue: {eigenvalue_current}',
ranks=[0])
block_eigenvalue = self.post_process(block_eigenvalue)
if self.verbose:
log_dist(f'post processed block_eigenvalue: {block_eigenvalue}', ranks=[0])
# {param_id: (eigenvalue, layer_id)}
ev_dict = {}
for i, (layer_keys, value) in enumerate(zip(param_keys, block_eigenvalue)):
ev_dict.update(dict.fromkeys(layer_keys, (value, i)))
return ev_dict
# 1. Map all eigenvalues to [0, 1.0].
# 2. Some layers can't generate valid eigenvalues on fp16 precision, use 1.0 instead.
def post_process(self, value_list):
max_value = abs(max(value_list, key=abs))
return [abs(v) / max_value if v != 0.0 else 1.0 for v in value_list]
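To make the power-iteration loop above concrete, here is a self-contained sketch of the same estimate applied to an explicit symmetric matrix instead of Hessian-vector products, so it runs without a model or second-order autograd; all names are local to the example.
```
import torch

torch.manual_seed(0)
A = torch.randn(64, 64)
A = (A + A.t()) / 2                       # symmetric matrix standing in for the Hessian

v = torch.randn(64)
v = v / v.norm()                          # mirrors normalize()

max_iter, tol = 100, 1e-2
current, previous = 1.0, 0.0
i = 0
while i < max_iter and abs(current) > 0 and abs((current - previous) / current) >= tol:
    previous = current
    Hv = A @ v                            # stands in for torch.autograd.grad(grads, params, grad_outputs=v)
    current = torch.dot(Hv, v).item()     # Rayleigh-quotient estimate, as in inner_product(Hv, v)
    v = Hv / Hv.norm()
    i += 1

print(f"dominant |eigenvalue| estimate after {i} iterations: {abs(current):.4f}")
print("reference:", torch.linalg.eigvalsh(A).abs().max().item())
```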

View File

@ -1,33 +1,33 @@
import numpy as np
from deepspeed.utils import log_dist
class ProgressiveLayerDrop(object):
r""" Progressive Layer Dropping (PLD) for model training.
This implements the PLD technique for compressed model training
from this paper: https://arxiv.org/pdf/2010.13369.pdf
Args:
theta (float): a hyper-parameter that controls the trade-off between training time and robustness.
The lower the theta value, the faster the training speed. Default value: 0.5.
gamma (float): a hyper-parameter that controls how fast the drop ratio increases. Default value: 0.001.
"""
def __init__(self, theta=0.5, gamma=0.001):
super().__init__()
self.theta = theta
self.gamma = gamma
self.current_theta = 1.0
log_dist(f'Enabled progressive layer dropping (theta = {self.theta})', ranks=[0])
def get_state(self):
kwargs = {'progressive_layer_drop': True, 'pld_theta': self.get_theta()}
return kwargs
def get_theta(self):
return self.current_theta
def update_state(self, global_step):
def _prob(x, gamma, p):
return (1. - p) * np.exp(-gamma * x) + p
self.current_theta = _prob(global_step, self.gamma, self.theta)
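For illustration, the schedule computed by ``update_state`` is just the closed-form curve theta(t) = (1 - theta) * exp(-gamma * t) + theta; the sketch below, using the default hyper-parameters from the constructor, shows how the keep-probability decays from 1.0 toward theta.
```
import numpy as np

theta, gamma = 0.5, 0.001   # default values from the constructor above

def pld_theta(global_step):
    return (1.0 - theta) * np.exp(-gamma * global_step) + theta

for step in (0, 100, 1000, 5000, 10000):
    print(step, round(pld_theta(step), 4))
# Starts at 1.0 (keep every layer early in training) and decays toward theta = 0.5.
```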

View File

@ -1,224 +1,224 @@
import torch
import math
from deepspeed.utils import log_dist
from deepspeed.utils import logger
from deepspeed.ops.quantizer import ds_quantizer
# number of 2-dimensional parameters in a layer
# this is set for transformer-based models
TWO_D_PARAMS = 6
class Quantizer(object):
def __init__(self,
q_target_bits=8,
q_start_bits=16,
q_period=100,
q_offset=100,
q_groups=1,
q_mixed_fp16=False,
q_change_ratio=0.01,
q_type=0,
q_rounding=0,
q_verbose=False,
q_eigenvalue=False,
use_quantizer_kernel=False,
layer_num=0):
self.q_target_bits = q_target_bits
self.q_start_bits = [q_start_bits] * (layer_num if layer_num != 0 else 1)
self.q_period = [q_period] * (layer_num if layer_num != 0 else 1)
self.q_offset = q_offset
self.q_groups = q_groups
self.q_mixed_fp16 = q_mixed_fp16
self.q_change_ratio = q_change_ratio
self.q_type = q_type
self.qsteps = 0
self.q_init_period = q_period
self.quantize_real_ratio = 1.000
self.q_verbose = q_verbose
self.q_eigenvalue = q_eigenvalue
self.use_quantizer_kernel = use_quantizer_kernel
self.q_rounding = q_rounding
self.layer_num = layer_num
def any_precision_switch(self):
if self.layer_num == 0:
return True
result = False
for index in range(self.layer_num):
if self.q_start_bits[index] != self.q_target_bits:
next_step = self.qsteps + (
TWO_D_PARAMS * (self.layer_num if self.layer_num != 0 else 1))
if next_step >= self.q_period[index]:
result = True
return result
def quantize(self,
parameter_group,
overflow,
eigenvalue_enabled,
block_eigenvalue={}):
if overflow and not eigenvalue_enabled:
return
self.step()
self.update_fp16_ratio()
for i in range(len(parameter_group)):
for p in parameter_group[i]:
if len(p.size()) > 1:
param_id = id(p)
eigenvalue, layer_id = block_eigenvalue[param_id] if param_id in block_eigenvalue else (None, 0)
if eigenvalue is not None:
factor = 1 + math.floor(eigenvalue * 4)
p.data = self.compute_quantization(p.data, layer_id, factor)
else:
p.data = self.compute_quantization(p.data, layer_id)
def step(self):
self.qsteps += (TWO_D_PARAMS * (self.layer_num if self.layer_num != 0 else 1))
def sr_quantize(self, input_flat, input_g, scale):
# Random number generator (Uniform)
p = torch.cuda.FloatTensor(input_flat.size(),
device=input_flat.device).uniform_()
p = torch.split(p, p.size(0) // self.q_groups)
add_s = torch.zeros_like(input_flat)
add_s = torch.split(add_s, add_s.size(0) // self.q_groups)
scale = [q_range / (2 * max(g.max(), g.min().abs())) for g in input_g]
# Quantize with INT rounding
input_flat = [(g * s).int().float() / s for (g, s) in zip(input_g, scale)]
# Compute the error
error = [((g - q).abs() / s) for (g, s, q) in zip(input_g, scale, input_flat)]
# Stochastic Rounding
add_s = [
a_s.masked_fill_(pg < err_g,
1 / s) for (a_s,
pg,
err_g,
s) in zip(add_s,
p,
error,
scale)
]
add_s = [
a_s * (g > 0).float() - a_s * (g < 0).float() for a_s,
g in zip(add_s,
input_flat)
]
input_flat = [((q + a_s) * s).clamp(-(q_range >> 1),
(q_range >> 1) - 1) / s for q,
a_s,
s in zip(input_flat,
add_s,
scale)]
return input_flat
def mixed_fp16_quantize(self, input, input_q, index):
if self.q_mixed_fp16 and self.q_start_bits[index] >= (self.q_target_bits - 1):
input_q = input * self.quantize_real_ratio + (
1 - self.quantize_real_ratio) * input_q
return input_q
return input_q
def compute_quantization(self, input, index=0, factor=1):
# fixing the quantization bits based on the training steps
# when reducing 1 bit at each period, we increase the period
# to go slowly toward the target quantization bits
# the period and starting bit can be configured
if self.q_offset > 0:
if self.qsteps >= self.q_offset:
self.q_offset = 0
self.qsteps = 0
else:
return input
if self.q_start_bits[index] != self.q_target_bits:
if self.qsteps >= self.q_period[index]:
self.quantize_real_ratio = 1.0
if self.q_eigenvalue:
self.q_period[index] <<= 1
self.q_period[index] *= factor
self.q_start_bits[index] -= 1
else:
for i in range(len(self.q_start_bits)):
self.q_start_bits[i] -= 1
self.q_period[i] <<= 1
if self.q_verbose:
logger.info(
f'Quantization settings: current bit-precision = {self.q_start_bits[index]}, step = {self.qsteps}, quantization period = {self.q_period[index]}, index = {index}'
)
assert (self.q_start_bits[index] >= self.q_target_bits), \
'Quantization bit is lower than target precision bits!'
# quantize the weights based on the selected bits and the value range
if not self.use_quantizer_kernel:
q_range = 2**self.q_start_bits[index]
input_flat = input.view(-1)
input_g = torch.split(input_flat, input_flat.size(0) // self.q_groups)
if self.q_type == 0: #symmetric
if self.use_quantizer_kernel:
input_q = ds_quantizer(input.clone(),
self.q_groups,
self.q_start_bits[index])
else:
scale = [q_range / (2 * max(g.max(), g.min().abs())) for g in input_g]
if self.q_rounding == 0: # Nearest value rounding
input_flat = [(g * s).round().clamp(-(q_range >> 1),
(q_range >> 1) - 1) / s for g,
s in zip(input_g,
scale)]
else: # Stochastic Rounding
if self.use_quantizer_kernel:
input_q = ds_quantizer(input.clone(),
self.q_groups,
self.q_start_bits[index],
sr=True)
else:
input_flat = self.sr_quantize(input_flat, input_g)
else: #asymmetric
if self.q_rounding == 0:
if self.use_quantizer_kernel:
input_q = ds_quantizer(input.clone(),
self.q_groups,
self.q_start_bits[index],
asym=True)
else:
scale = [(g.max() - g.min()) / q_range for g in input_g]
input_flat = [
((g - g.min()) / s).round().clamp(0,
(q_range - 1)) * s + g.min()
for g,
s in zip(input_g,
scale)
]
else:
input_q = ds_quantizer(input.clone(),
self.q_groups,
self.q_start_bits[index],
asym=True)
if self.use_quantizer_kernel or (self.q_type and self.q_rounding):
return self.mixed_fp16_quantize(input, input_q, index)
else:
if self.q_mixed_fp16 and self.q_start_bits[index] >= (self.q_target_bits -
1):
input_flat = [(self.quantize_real_ratio * g) +
((1 - self.quantize_real_ratio) * g_q) for g,
g_q in zip(input_g,
input_flat)]
input_q = torch.cat(input_flat)
input_q = input_q.reshape(input.size())
return input_q
def update_fp16_ratio(self):
if self.q_mixed_fp16:
if self.quantize_real_ratio > 0:
self.quantize_real_ratio -= self.q_change_ratio
else:
self.quantize_real_ratio = 0.000
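As a standalone sketch of the symmetric, nearest-rounding path of ``compute_quantization`` (``q_type=0``, ``q_rounding=0``, no quantizer kernel), the grouped fake-quantization below reproduces the scale/round/clamp/rescale steps; the function name and tensor shapes are illustrative.
```
import torch

def fake_quantize_symmetric(x, bits=8, groups=1):
    q_range = 2 ** bits
    flat = x.view(-1)
    chunks = torch.split(flat, flat.numel() // groups)
    out = []
    for g in chunks:
        # map the observed value range of this group onto the integer grid
        scale = q_range / (2 * max(g.max(), g.min().abs()))
        out.append((g * scale).round().clamp(-(q_range >> 1), (q_range >> 1) - 1) / scale)
    return torch.cat(out).reshape(x.size())

w = torch.randn(4, 8)
w_q = fake_quantize_symmetric(w, bits=8, groups=2)
print((w - w_q).abs().max())   # small quantization error at 8 bits
```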

File diff suppressed because it is too large

View File

@ -1,49 +1,49 @@
# DeepSpeed Documentation
This directory includes the source code for the website and documentation of DeepSpeed. The `code-docs/` directory is used to build [deepspeed.readthedocs.io](https://deepspeed.readthedocs.io/en/latest/).
[deepspeed.ai](https://www.deepspeed.ai/) is the recommended way to read all DeepSpeed documentation. Directly viewing the Markdown files in this directory will not include images and other features.
## Building the documentation locally
You can serve the DeepSpeed website locally. This is especially useful for development.
### Prerequisites
The DeepSpeed website relies on [Jekyll](https://jekyllrb.com/). There are several [guides for installation](https://jekyllrb.com/docs/installation/). The instructions below assume you are in an Ubuntu environment and have been tested on WSL.
First ensure that you have the necessary packages (e.g., `make` and `zlib`).
```
sudo apt-get install build-essential zlib1g-dev ruby-full
```
Add these lines to your `.bashrc` or equivalent to ensure you have permissions to install Ruby packages without `sudo`.
```
export GEM_HOME="$HOME/gems"
export PATH="$HOME/gems/bin:$PATH"
```
Don't forget to `source ~/.bashrc` afterwards 😊.
Now we can install Jekyll and [Bundler](https://bundler.io/):
```
gem install jekyll bundler
```
### Start a local webserver
We now need to install the required Ruby packages for the website.
**NOTE**: you should change to this folder (i.e., docs) before running the installation command to avoid this [error](https://stackoverflow.com/questions/10012181/bundle-install-returns-could-not-locate-gemfile/35157872):
> Could not locate Gemfile
**NOTE**: this step frequently hangs when connected to a VPN (including MSVPN). Simply disconnect for the package installation.
```
bundle install
```
You can now start a local webserver via:
```
bundle exec jekyll serve
```
The website should now be accessible at [http://localhost:4000](http://localhost:4000)

View File

@ -1,100 +1,100 @@
---
layout: single
title: "DeepSpeed ZeRO-3 Offload"
excerpt: ""
categories: news
new_post: true
date: 2021-03-08 00:00:00
---
Today we are announcing the release of ZeRO-3 Offload, a highly efficient and easy to use implementation of ZeRO Stage 3 and ZeRO Offload combined, geared towards our continued goal of democratizing AI by making efficient large-scale DL training available to everyone. The key benefits of ZeRO-3 Offload are:
* Unprecedented memory efficiency to run very large models on a limited number of GPU resources - e.g., fine-tune models with over 40B parameters on a single GPU and over 2 Trillion parameters on 512 GPUs!
* Extremely Easy to use:
* Scale to over a trillion parameters without the need to combine multiple parallelism techniques in complicated ways.
* For existing DeepSpeed users, turn on ZeRO-3 Offload with just a few flags in DeepSpeed Config file.
* High-performance per-GPU throughput and super-linear scalability across GPUs for distributed training.
* With 1 Trillion parameters, ZeRO-3 Offload sustains 25 PetaFlops in compute performance on 512 NVIDIA V100 GPUs, achieving 49 TFlops/GPU.
* Up to 2x improvement in throughput compared to ZeRO-2 Offload on a single GPU
<h2> Overview of ZeRO family of technology </h2>
The Zero Redundancy Optimizer (abbreviated ZeRO) is a family of memory optimization technologies for large-scale distributed deep learning. Unlike data parallelism (that is efficient but can only support a limited model size) or model parallelism (that can support larger model sizes but requires significant code refactoring while adding communication overhead that limits efficiency), ZeRO allows fitting larger models in memory without requiring code refactoring while remaining very efficient. ZeRO does so by eliminating the memory redundancy that is inherent in data parallelism while limiting the communication overhead to a minimum.
ZeRO removes the memory redundancies across data-parallel processes by partitioning the three model states (optimizer states, gradients, and parameters) across data-parallel processes instead of replicating them. By doing this, it boosts memory efficiency compared to classic data-parallelism while retaining its computational granularity and communication efficiency.
There are three stages in ZeRO corresponding to the three model states, as shown in Figure 1: the first stage (ZeRO-1) partitions only the optimizer states, the second stage (ZeRO-2) partitions both the optimizer states and the gradients, and the final stage (ZeRO-3) partitions all three model states (for more details see the ZeRO [paper](https://arxiv.org/abs/1910.02054v3)).
<a href="/assets/images/zero3-offload-memory-overview.png">
<img src="/assets/images/zero3-offload-memory-overview.png">
</a>
Figure 1. Overview of ZeRO memory savings
In addition to these three stages, the ZeRO family of technologies also includes ZeRO-2 Offload. ZeRO-2 Offload is a heterogeneous DL training technology that works in conjunction with ZeRO-2 to offload partitioned optimizer states and gradients to CPU memory. ZeRO-2 Offload offers the full memory advantage of ZeRO-2 even on a single GPU, while at the same time offering the great scalability of ZeRO-2 in multi-GPU setups. The DeepSpeed library has offered ZeRO-2 Offload since Sept 2020. For details, please see below:
* ZeRO: [Stage 1 blog](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/), [Stage 2 blog](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/), [Tutorial](/tutorials/zero)
* ZeRO-Offload: [Blog](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/#toc-heading-3), [Tutorials](/tutorials/zero-offload), [Paper link](https://arxiv.org/abs/2101.06840)
<h2>ZeRO-3 Offload</h2>
With today's release of ZeRO-3 Offload, we are adding support for partitioning and offloading parameters in addition to the optimizer state and gradient partitioning already supported by ZeRO-2 Offload in DeepSpeed. With parameter partitioning, ZeRO-3 Offload implements the full set of features in the three stages of ZeRO, which allows model size to grow linearly with the number of GPUs. In addition, ZeRO-3 Offload can also optionally offload all these model states to CPU to further reduce GPU memory consumption, leveraging both CPU and GPU to maximize the memory and compute efficiency of the entire system.
We believe ZeRO-3 Offload offers a massive leap for large model training, in three regards:
i) Unprecedented model scale,
ii) Ease of supporting very-large models, and
iii) Achieving excellent training efficiency.
<h2>Unprecedented model scale</h2>
Unlike ZeRO-2 and ZeRO-Offload where the parameters have to fit in the memory of a single GPU, ZeRO-3 Offload can partition the parameters across GPUs, and offload them to CPU, supporting model sizes that are much larger than the memory on a single GPU. Furthermore, ZeRO-3 Offload goes beyond the state-of-the-art hybrid 3D-parallelism (data, model and pipeline parallelism combined). While 3D Parallelism is limited by the aggregate GPU memory, ZeRO-3 Offload can exploit both GPU and CPU memory, the latter of which is much larger and cheaper compared to GPU memory. This allows ZeRO-3 Offload to train larger model sizes with the given GPU and CPU resources than any other currently available technology.
<i>Model Scale on Single GPU</i>: ZeRO-3 Offload can train models with over 40B parameters efficiently on a single GPU (e.g., 32GB V100 GPU + 1.5TB CPU memory). This is 3x larger than what is possible with ZeRO-2 Offload, the current state-of-the art.
<i>Model Scale on Multi-GPUs</i>: With ZeRO-3 Offload you can train a trillion and two trillion parameter models on NVIDIA 32GB V100 DGX-2 cluster with 256 GPUs and 512 GPUs, respectively. In contrast, the state-of-art 3D Parallelism requires 800 GPUs, and 1600 GPUs, respectively, to fit the same sized models. This represents a 3x reduction in GPUs required to fit models with over a trillion parameters.
<h2>Ease of supporting very large models</h2>
From a system perspective, training models with hundreds of billions and trillions of parameters is extremely challenging. Data parallelism cannot scale the model size much further beyond a billion parameters, model parallelism (with tensor slicing) cannot be used to scale model size efficiently beyond a single node boundary due to massive communication overheads, and pipeline parallelism cannot scale beyond the number of layers available in a model, which limits both the model size and the number of GPUs that it can scale to.
The only existing parallel technology available that can scale to over a trillion parameters on massively parallel GPU clusters is the [3D parallelism](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/#toc-heading-0) that combines data, model and pipeline parallelism in complex ways. While such a system can be very efficient, it requires major model code refactoring from data scientists to split the model into load balanced pipeline stages. This also makes 3D parallelism inflexible in the type of models that it can support, since models with complex dependency graphs cannot be easily converted into a load balanced pipeline.
ZeRO-3 Offload addresses these challenges in two ways:
i) With ground-breaking memory efficiency, ZeRO-3 and ZeRO-3 Offload are the only DL parallelism technologies that can efficiently scale to over a trillion parameters on their own, without requiring a hybrid parallelism strategy, greatly simplifying the system stack for DL training.
ii) ZeRO-3 Offload requires virtually no model refactoring from model scientists, liberating data scientists to scale up complex models to hundreds of billions to trillions of parameters.
<h2>Excellent training efficiency</h2>
<i>High-performance per-GPU throughput on multiple nodes</i>: ZeRO-3 Offload offers excellent training efficiency for multi-billion and trillion parameter models on multiple nodes. It achieves a sustained throughput of up to 50 Tflops per GPU running on 32 DGX2 nodes comprising 512 NVIDIA V100 GPUs (see Figure 2). In comparison, the standard data parallel training with PyTorch can only achieve 30 TFlops per GPU for a 1.2B parameter model, the largest model that can be trained using data parallelism alone.
<a href="/assets/images/zero3-offload-512-v100.png">
<img src="/assets/images/zero3-offload-512-v100.png">
</a>
Figure 2. ZeRO-3 Offload: Multi-billion and trillion parameter model throughput on 512 V100 GPUs
ZeRO-3 Offload obtains high efficiency despite the 50% communication overhead of ZeRO Stage 3 compared to standard data parallel training for a fixed batch size. This is made possible through a communication overlap centric design and implementation, which allows ZeRO-3 Offload to hide nearly all of the communication volume with computation, while taking advantage of a larger batch size for improved efficiency resulting from better GPU memory efficiency.
<i>Efficient multi-billion parameter model training on a single GPU</i>: ZeRO-3 Offload further democratizes AI by enabling efficient training of multi-billion parameter models on a single GPU. For single GPU training, ZeRO-3 Offload provides benefits over ZeRO-2 Offload along two dimensions. First, ZeRO-3 Offload increases the size of models trainable on a single V100 from 13B to 40B. Second, for ZeRO-3 Offload provides speedups (e.g., 2.3X for 13B) compared to ZeRO-2 Offload for model sizes trainable by both solutions. These results are summarized in Figure 3.
<a href="/assets/images/zero3-offload-1-v100.png">
<img src="/assets/images/zero3-offload-1-v100.png">
</a>
Figure 3. Multi-billion parameter model training on one V100 GPU
<i>Super-Linear scalability across GPUs</i>: Additionally, ZeRO-3 Offload also preserves the super-linear scalability characteristics that we have demonstrated with all our previous ZeRO technologies (ZeRO Stage 1, ZeRO Stage 2 and ZeRO Offload). ZeRO-3 Offload can exploit the aggregate PCI-E bandwidth between GPU and CPU across all the GPUs in multi-GPU training configuration, and at the same time, it can also exploit the aggregate CPU compute across all the nodes. As a result, the CPU-GPU-CPU communication time as well as the optimizer update time decreases linearly with number of GPUs and nodes, respectively, allowing ZeRO-3 Offload to exhibit super-linear scaling (see Figure 4).
<a href="/assets/images/zero3-offload-200B-scalability.png">
<img src="/assets/images/zero3-offload-200B-scalability.png">
</a>
Figure 4. ZeRO-3 Offload Superlinear Scalability for a 200B parameter model.
<h2>How to use ZeRO-3 Offload</h2>
As with many other existing DeepSpeed features, once the user model has been converted to use DeepSpeed, enabling ZeRO-3 Offload is as easy as turning on a couple of flags in the DeepSpeed config file. Supporting advanced features like weight sharing, or enabling extremely large models that must be partitioned across GPUs/nodes to fit in GPU/CPU memory, requires only a couple of additional lines of code using the ZeRO-3 Offload API.
If you are already a DeepSpeed user, you can find our detailed tutorial on ZeRO-3 Offload below. If you are new to DeepSpeed, we recommend that you start at the getting started page before trying out our ZeRO-3 Offload Tutorial.
* DeepSpeed: [Getting Started Page](/getting-started/)
* ZeRO-3 Offload [Documentation](https://deepspeed.readthedocs.io/en/latest/zero3.html), [Tutorial](/tutorials/zero/#training-trillion-scale-models-with-zero-3-offload)
The DeepSpeed Team is very excited to share ZeRO-3 Offload with the DL community.
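For concreteness, a sketch of the kind of configuration fragment those flags refer to is shown below; the keys and values are illustrative and should be checked against the ZeRO-3 Offload documentation linked above.
```
# Illustrative DeepSpeed config fragment for ZeRO-3 with CPU offload
# (values are examples only; see the ZeRO-3 Offload documentation above).
ds_config = {
    "train_batch_size": 32,
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu"},
        "offload_param": {"device": "cpu"}
    }
}
# Written out as JSON, this is the file passed to DeepSpeed via the usual
# --deepspeed_config command-line argument.
```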
---
layout: single
title: "DeepSpeed Inference: Multi-GPU inference with customized inference kernels and quantization support"
excerpt: ""
categories: news
new_post: false
date: 2021-03-16 00:00:00
---
While DeepSpeed supports training advanced large-scale models, using these trained models in the desired application scenarios is still challenging due to three major limitations in existing inference solutions: 1) lack of support for multi-GPU inference to fit large models and meet latency requirements, 2) limited GPU kernel performance when running inference with small batch sizes, and 3) difficulties in exploiting quantization, which includes both quantizing the model to reduce its size and latency and supporting high-performance inference of quantized models without specialized hardware.
To handle these challenges, we introduce DeepSpeed Inference, which seamlessly adds high-performance inference support to large models trained in DeepSpeed with three key features: inference-adapted parallelism for multi-GPU inference, inference-optimized kernels tuned for small batch sizes, and flexible support for quantize-aware training and inference kernels for quantized models.
## Multi-GPU Inference with Adaptive Parallelism
Parallelism is an effective approach to fit large models and reduce per-device memory consumption for both training and inference. However, simply reusing the parallelism choices and degrees from training does not work well for inference. The model-parallel (MP) and pipeline-parallel (PP) configuration, alongside data parallelism (DP), is normally set during model training based on memory footprint, computation style, and resource budget. On one hand, inference intrinsically requires less memory, so it can afford a larger partition per device, which reduces the degree of parallelism needed for deployment. On the other hand, optimizing latency or meeting latency requirements is often a first-class goal in inference, whereas training optimizes throughput.
To obtain the desired latency, DeepSpeed Inference automatically adapts MP as an effective approach to reduce model latency, and its parallelism degree is often determined first. With MP, we can split the model and parallelize computational operations across multiple devices (GPUs) to reduce latency, but doing so reduces computation granularity and increases communication, which may hurt throughput. Once the latency target has been met, DeepSpeed can apply pipeline parallelism to maximize throughput. Overall, DeepSpeed Inference supports flexible adaptation of both the parallelism approach and degree from training to inference, minimizing latency while saving deployment costs.
## Customized Inference Kernels for Boosted Compute Efficiency of Transformer Blocks
To achieve high compute efficiency, DeepSpeed-inference offers inference kernels tailored for Transformer blocks through operator fusion, taking model-parallelism for multi-GPU into account. The main difference between our kernel-fusion scheme and similar approaches is that we not only fuse element-wise operations (such as bias-add, residual, and activation function), but also merge the General matrix multiply (GeMM) operations with other operations. To do this, we design an efficient implementation for the vector-matrix or skinny matrix-matrix multiplication that allows us to fuse more operations at the reduction boundary of GeMM operations.
### Kernel-Fusion
We take two main policies for fusing operations: 1) keeping the access pattern of inputs and outputs intact throughout the sequence of fused operations; 2) fusing operations at each all-reduce boundary. The first policy ensures that different thread blocks won't need to transfer data between Streaming Multiprocessors (SMs), since SMs have no straightforward way to communicate other than through main memory, which would add block-synchronization overhead due to the non-deterministic behavior of memory accesses. The reason behind the second policy is that we cannot continue execution unless the partial results have been reduced across the model-parallel GPUs.
![Inference-Kernel-Fusion](/assets/images/inference-kernel-fusion.png){: .align-center}
Figure 1: Transformer layer with Megatron-style model-parallelism all-reduce components. The figure illustrates the parts of the layer that are fused together, marked with broken lines (the width of a line indicates the fusion depth).
Figure 1 shows the different components of a Transformer layer, and the groups of operations considered for fusion in our inference optimization. We also consider the NVIDIA Megatron-LM style of parallelism that partitions attention (Attn) and feed-forward (FF) blocks across multiple GPUs. Thus, we include the two all-reduce operations that reduce the results among parallel GPUs after Attn and FF blocks. As Figure 1 shows, we fuse the operations inside a Transformer layer at four main regions:
1. Input Layer-Norm plus Query, Key, and Value GeMMs and their bias adds.
2. Transform plus Attention.
3. Intermediate FF, Layer-Norm, Bias-add, Residual, and Gaussian Error Linear Unit (GELU).
4. Bias-add plus Residual.
To fuse these operations, we exploit shared memory as an intermediate cache for transferring data between the reduction operations used in layer-norm and GeMM and the element-wise operations. Moreover, we use warp-level instructions to communicate data between threads when reducing partial computations. In addition, we use a new schedule for GeMM operations, which allows fusing as many operations as needed for the third kernel-fusion. We also combine the GeMMs for the attention computation in the second kernel-fusion by using an implicit matrix transformation to reduce memory pressure. Compared to the unfused computation style using cuBLAS GeMM, we improve the performance by 1.5x, 2.9x, 3x, and 1.2x for these four kernel-fusions, respectively.
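For readers who want a concrete reference for what two of these fused regions compute, the snippet below restates regions 3 and 4 as plain, unfused PyTorch operations; it is only an illustration of the math being fused, not the fused CUDA kernels that DeepSpeed ships:

```python
import torch
import torch.nn.functional as F

def region3_reference(x, residual, w_ff1, b_ff1, ln_weight, ln_bias):
    # Region 3 (unfused reference): bias-add/residual, layer-norm, intermediate
    # FF GeMM, bias-add, and GELU, written as separate PyTorch ops.
    h = F.layer_norm(x + residual, x.shape[-1:], ln_weight, ln_bias)
    return F.gelu(h @ w_ff1 + b_ff1)

def region4_reference(x, bias, residual):
    # Region 4 (unfused reference): bias-add plus residual.
    return x + bias + residual
```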
## Seamless pipeline from training to inference with automatic kernel-injection
To run a model in inference mode, DeepSpeed simply requires the location of the model checkpoints and the desired parallelism configuration, i.e., the MP/PP degree. DeepSpeed Inference kernels can also be enabled for many well-known model architectures such as HuggingFace (BERT and GPT-2) or Megatron GPT-based models using a pre-defined policy map that maps the original parameters to the parameters in the inference kernels. For other Transformer-based models, users can specify their own policy map. Note that DS-Inference can run independently of the training pipeline as long as it receives all model checkpoints, and the DeepSpeed Transformer kernels for inference can be injected into any Transformer model if the right mapping policy is defined. For more information on how to enable the Transformer inference kernel as well as specifying parallelism, please refer to our [inference tutorial](https://www.deepspeed.ai/tutorials/inference-tutorial/).
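As a sketch of what this looks like from user code (argument names follow the inference tutorial linked above and may differ across DeepSpeed versions), wrapping a trained HuggingFace model for two-way model-parallel inference with kernel injection is roughly:

```python
import torch
import deepspeed
from transformers import AutoModelForCausalLM

# Load the trained model as usual, then hand it to DeepSpeed Inference.
model = AutoModelForCausalLM.from_pretrained("gpt2")

# init_inference splits the model across `mp_size` GPUs and, when
# replace_with_kernel_inject is enabled, swaps in the fused inference kernels
# for the recognized Transformer blocks.
ds_engine = deepspeed.init_inference(model,
                                     mp_size=2,
                                     dtype=torch.half,
                                     replace_with_kernel_inject=True)
model = ds_engine.module  # use like a normal torch module for generation
```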
## Flexible quantization support
To further reduce the inference cost for large-scale models, we created the DeepSpeed Quantization Toolkit, supporting flexible quantize-aware training and high-performance kernels for quantized inference.
For training, we introduce a novel approach called Mixture of Quantization (MoQ), which is inspired by mixed-precision training while seamlessly applying quantization. With MoQ, we can control the precision of the model by simulating the impact of quantization when updating the parameters at each step of training. Moreover, it supports flexible quantization policies and schedules—we find that by dynamically adjusting the number of quantization bits during training, the final quantized model provides higher accuracy under the same compression ratio. To adapt to different tasks, MoQ can also leverage the second order information of models to detect their sensitivity to precision and adjust the quantization schedule and target accordingly.
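To make the idea of simulating quantization during training concrete, the snippet below shows a generic fake-quantization step and a toy bit-width schedule of the kind such schemes build on; it is an illustrative sketch only, not DeepSpeed's MoQ implementation:

```python
import torch

def fake_quantize(weight: torch.Tensor, num_bits: int) -> torch.Tensor:
    # Symmetric uniform quantization simulated in full precision: the weight is
    # rounded to a num_bits grid and immediately de-quantized, so training
    # "feels" the quantization error while the parameters stay in fp16/fp32.
    qmax = 2 ** (num_bits - 1) - 1
    scale = weight.abs().max().clamp(min=1e-8) / qmax
    return torch.round(weight / scale).clamp(-qmax - 1, qmax) * scale

# A toy precision schedule: start at 16 bits and tighten toward 8 bits,
# loosely mimicking how a dynamic schedule lowers precision during training.
def bits_at_step(step: int, start_bits: int = 16, target_bits: int = 8, period: int = 1000) -> int:
    return max(target_bits, start_bits - step // period)
```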
To maximize the performance gains from quantized models, we provide inference kernels tailored for quantized models that reduce latency by optimizing data movement, and they do not require specialized hardware. Finally, our toolkit does not require any code changes on the client side, making it easy to use.
## Performance results
Boosting throughput and reducing inference cost. Figure 3 shows the inference throughput per GPU for the three model sizes corresponding to the three Transformer networks, GPT-2, Turing-NLG, and GPT-3. DeepSpeed Inference increases per-GPU throughput by 2 to 4 times when using the same FP16 precision as the baseline. By enabling quantization, we boost throughput further. We reach a throughput improvement of 3x for GPT-2, 5x for Turing-NLG, and 3x for a model that is similar in characteristics and size to GPT-3, which directly translates to a 3-5x reduction in the cost of serving these large models. In addition, we achieve these throughput and cost improvements without compromising latency, as shown in Figure 5.
![Inference-Throughput](/assets/images/inference-throughput.png){: .align-center}
Figure 3: Inference throughput for different model sizes. DeepSpeed Inference achieves 3x to 5x higher throughput than baseline.
One source of inference cost reduction is reducing the number of GPUs needed for hosting large models, as shown in Figure 4. The savings in GPU resources come from 1) using inference-adapted parallelism, which allows users to adjust the model and pipeline parallelism degrees relative to the trained model checkpoints, and 2) shrinking the model memory footprint by half with INT8 quantization. As shown in this figure, we use 2x fewer GPUs to run inference for the 17B model size by adapting the parallelism. Together with INT8 quantization through DeepSpeed MoQ, we use 4x and 2x fewer GPUs for the 17B and 175B sizes, respectively.
![Inference-Throughput](/assets/images/gpu-numbers.png){: .align-center}
Figure 4: Number of GPUs used for running inference on the different model sizes shown in Figure 3.
Reducing inference latency. For the application scenarios where inference latency is critical, we can increase model parallelism degree in DeepSpeed Inference to reduce inference latency further. As Figure 5 depicts, we can reduce the latency by 2.3x compared to PyTorch as we increase the model-parallelism size to 4. Furthermore, we can still have high latency improvement with a fewer number of GPUs by adapting the parallelism at inference and using MoQ to quantize the model. We obtain 1.3x and 1.9x speedups while using 4x and 2x lower resources than baseline, respectively.
![Inference-Throughput](/assets/images/inference-latency.png){: .align-center}
Figure 5. Inference latency for the 17B model using different parallelism configurations to optimize latency.
---
title: "Mixture of Experts"
---
DeepSpeed v0.5 introduces new support for training Mixture of Experts (MoE) models. MoE models are an emerging class of sparsely activated models that have sublinear compute costs with respect to their parameters. For example, the [Switch Transformer](https://arxiv.org/abs/2101.03961) consists of over 1.6 trillion parameters, while the compute required to train it is approximately equal to that of a 10 billion-parameter dense model. This increase in model size offers tremendous accuracy gains for a constant compute budget.
For more details on results and further discussion, please see our press release: [DeepSpeed powers 8x larger MoE model training with high performance]({{ site.press_release_v5 }}).
## Getting started with a simple MoE example
**Note:** DeepSpeed MoE requires Pytorch 1.8 or above.
{: .notice--info}
As a simple starting point we will show how to apply DeepSpeed MoE to a cifar10 example. Please refer to
our [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) going forward.
If you are adding MoE to an existing model you can use the snippet below to help guide you:
### Expert groups initialization
DeepSpeed MoE supports five different forms of parallelism, and it exploits both GPU and CPU memory. Its flexible design enables users to mix different types of prevalent parallelism techniques, as shown in the table below.
| Short Name | Flexible Parallelism Configurations | Benefit |
| ---------------- | ------------------------------------| --------------------------------------------------------------------------- |
| E | Expert | Scales the model size by increasing the number of experts |
| E + D | Expert + Data | Accelerates training throughput by scaling to multiple data parallel groups |
| E + Z | Expert + ZeRO-powered data | Partitions the nonexpert parameters to support larger base models |
| E + D + M | Expert + Data + Model | Supports massive hidden sizes and even larger base models than E+Z |
| E + D + Z | Expert + Data + ZeRO-powered data | Supports massive hidden sizes and even larger base models than E+Z |
| E + Z-Off + M | Expert + ZeRO-Offload + Model | Leverages both GPU and CPU memory for large MoE models on limited # of GPUs |
To support different forms of parallelism, we create a notion of DeepSpeed process groups, which reside in ```deepspeed.utils.groups.py```.
For most cases, the model training code needs to initialize these groups by calling
```python
deepspeed.utils.groups.initialize(ep_size="desired expert-parallel world size")
```
The GPUs (or ranks) participating in an expert-parallel group will distribute the total number of experts specified by the model training code argument num_experts.
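For intuition about how the experts end up distributed, here is a simple arithmetic sketch (not DeepSpeed's internal partitioning code):

```python
# With num_experts total experts and an expert-parallel group of ep_size ranks,
# each rank in the group hosts num_experts // ep_size local experts, while
# different expert-parallel groups replicate the experts data-parallel style.
num_experts = 8
ep_size = 2
local_experts = num_experts // ep_size
print(f"{local_experts} experts per GPU")  # -> 4 experts per GPU
```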
### MoE layer API
The `hidden_size` is the input dimension of a particular layer, and the output dimension of the MoE layer is the same. This could lead to some changes to your model definition, especially for vision/convolutional models, because the input/output dimensions don't match in certain cases. E.g., in the CIFAR-10 example, we modify the third fully connected layer to add the MoE layer. To cater for this, we need to add an additional fully-connected layer whose input dimension is equal to the output dimension of the MoE layer.
Original model config
```python
self.fc3 = nn.Linear(84, 10)
```
Updated with MoE Layers
```python
self.fc3 = nn.Linear(84, 84)
self.fc3 = deepspeed.moe.layer.MoE(hidden_size=84, expert=self.fc3, num_experts=args.num_experts, ...)
self.fc4 = nn.Linear(84, 10)
```
### An Example Scenario
Suppose the total number of GPUs (the world size) and the number of GPUs in our expert-parallel world are as follows.
```python
WORLD_SIZE = 4
EP_WORLD_SIZE = 2
EXPERTS = 8
```
The user code needs to initialize the groups as follows.
```python
groups.initialize(ep_size=EP_WORLD_SIZE)
```
After that, the model code needs to use the deepspeed.moe.layer.MoE API as follows.
```python
self.experts = deepspeed.moe.layer.MoE(hidden_size=input_dim, expert=ExpertModule(), num_experts=EXPERTS)
```
With the above two commands, the DeepSpeed runtime will be set to train an MoE model with a total of 8 experts on 4 GPUs in 4 experts/GPU mode. We call this the E + D mode as described earlier in the table.
For more advanced use cases of the groups API, including interoperability with a Megatron-style mpu object, watch this space!
```python
import torch
import deepspeed
import deepspeed.utils.groups as groups
from deepspeed.moe.layer import MoE

WORLD_SIZE = 4
EP_WORLD_SIZE = 2
EXPERTS = 8

# Create the expert-parallel process groups before constructing any MoE layers.
groups.initialize(ep_size=EP_WORLD_SIZE)

# Wrap an ordinary linear layer as the expert of an MoE layer with top-1 gating.
fc3 = torch.nn.Linear(84, 84)
fc3 = MoE(hidden_size=84, expert=fc3, num_experts=EXPERTS, k=1)
fc4 = torch.nn.Linear(84, 10)
```
For a runnable end-to-end example, please look at the [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar).
### Combining ZeRO-Offload and DeepSpeed MoE for very large models
To use MoE Layers in DeepSpeed, we rely on two parameter groups that are passed to an optimizer. A concrete example to create such groups is available from the [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar).
The relevant function that creates these param groups is as follows.
```python
def create_moe_param_groups(model):
    from deepspeed.moe.utils import is_moe_param

    # Non-expert parameters follow the usual data-parallel handling.
    params_with_weight_decay = {'params': [], 'name': 'weight_decay_params'}
    # Expert parameters are marked with 'moe': True so the optimizer treats them separately.
    moe_params_with_weight_decay = {
        'params': [],
        'moe': True,
        'name': 'weight_decay_moe_params'
    }

    for module_ in model.modules():
        moe_params_with_weight_decay['params'].extend([
            p for n, p in list(module_._parameters.items())
            if p is not None and is_moe_param(p)
        ])
        params_with_weight_decay['params'].extend([
            p for n, p in list(module_._parameters.items())
            if p is not None and not is_moe_param(p)
        ])

    return params_with_weight_decay, moe_params_with_weight_decay
```
The above param groups can then be fed to the ZeRO stage-2 optimizer as follows.
```python
net = Net()
parameters = create_moe_param_groups(net)
model_engine, optimizer, trainloader, __ = deepspeed.initialize(
    args=args, model=net, model_parameters=parameters, training_data=trainset)
```
We are working on automating this functionality in the DeepSpeed ZeRO optimizer so the model training code can be simplified further.
To run the [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) with ZeRO-Offload (stage 2) and MoE, please set the ds_config flags
```json
"zero_optimization": {
  "stage": 2,
  "allgather_partitions": true,
  "reduce_scatter": true,
  "allgather_bucket_size": 50000000,
  "reduce_bucket_size": 50000000,
  "overlap_comm": true,
  "contiguous_gradients": true,
  "cpu_offload": true
}
```
An additional optimization to save memory for extremely large model training on a limited number of GPUs has also been introduced. Please enable it by adding the following flag to the fp16 section of ds_config.
```json
"fp16": {
  "enabled": true,
  "fp16_master_weights_and_grads": true
}
```
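Putting the fragments above together, a complete minimal ds_config for this MoE + ZeRO-Offload setup might look like the sketch below, expressed as a Python dict for illustration; the batch-size values are placeholders, and newer DeepSpeed releases express CPU offloading via an `offload_optimizer` block instead of the `cpu_offload` flag:

```python
# Sketch of a full ds_config assembled from the fragments shown above
# (batch-size values are illustrative placeholders).
ds_config = {
    "train_batch_size": 256,
    "train_micro_batch_size_per_gpu": 8,
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": True,
        "reduce_scatter": True,
        "allgather_bucket_size": 50000000,
        "reduce_bucket_size": 50000000,
        "overlap_comm": True,
        "contiguous_gradients": True,
        "cpu_offload": True,  # offload partitioned optimizer states to CPU
    },
    "fp16": {
        "enabled": True,
        "fp16_master_weights_and_grads": True,  # keep master weights/grads in fp16 to save memory
    },
}
```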
<!--
hidden_size (int): the hidden dimension of the model.
expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
num_experts (int, optional): default=1, the total number of experts per layer.
k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
output_dropout_prob (float, optional): default=0.5, output dropout probability.
capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
min_capacity (int, optional): default=4, min number of tokens per expert.
noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
-->
## Random Token Selection
We have devised a new technique called “Random Token Selection” that greatly improves convergence. Random token selection addresses the problem of biased token selection in MoE model training. Our upcoming paper describes this technique and its results in detail. This feature is already part of the DeepSpeed runtime and is enabled by default, so users can take advantage of it without any config flags or command-line arguments.
## Advanced MoE usage
Watch this space! We plan to add more interesting and detailed examples of using DeepSpeed MoE in the coming weeks.
---
title: "ZeRO-Offload"
---
ZeRO-3 Offload consists of a subset of features in our newly released ZeRO-Infinity. Read our [ZeRO-Infinity blog](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) to learn more!
We recommend that you read the tutorials on [Getting Started](/getting-started/) and [ZeRO](/tutorials/zero/) before stepping through this tutorial.
ZeRO-Offload is a ZeRO optimization that offloads the optimizer memory and computation from the GPU to the host CPU. ZeRO-Offload enables large models with up to 13 billion parameters to be efficiently trained on a single GPU. In this tutorial we will use ZeRO-Offload to train a 10-billion parameter GPT-2 model in DeepSpeed. Furthermore, *using ZeRO-Offload in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration json*. No code changes are needed.
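For context, here is a minimal sketch of a typical DeepSpeed client loop; nothing in it changes when ZeRO-Offload is enabled, since offloading is controlled entirely by the configuration json (`args`, `model`, and `data_loader` are assumed to be defined by the client script):
```python
import deepspeed

# Standard DeepSpeed client loop; unchanged when ZeRO-Offload is turned on.
model_engine, optimizer, _, _ = deepspeed.initialize(args=args,
                                                     model=model,
                                                     model_parameters=model.parameters())
for batch in data_loader:
    loss = model_engine(batch)    # forward pass (assumes the model returns the loss)
    model_engine.backward(loss)   # backward pass handled by the engine
    model_engine.step()           # optimizer step; runs on the CPU under ZeRO-Offload
```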
## ZeRO-Offload Overview
For large model training, optimizers such as [Adam](https://arxiv.org/abs/1412.6980) can consume a significant amount of GPU compute and memory. ZeRO-Offload reduces the GPU compute and memory requirements of such models by leveraging compute and memory resources on the host CPU to execute the optimizer. Furthermore, to prevent the optimizer from becoming a bottleneck, ZeRO-Offload uses DeepSpeed's highly optimized CPU implementation of Adam called [DeepSpeedCPUAdam](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/ops/adam). DeepSpeedCPUAdam is 5X--7X faster than the standard PyTorch implementation. For a deep dive into the design and performance of ZeRO-Offload, please see our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/#toc-heading-3).
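Note that when ZeRO-Offload is enabled, the DeepSpeed engine constructs DeepSpeedCPUAdam for you; you normally do not instantiate it yourself. Purely as an illustrative sketch of the optimizer as a drop-in Adam for CPU-resident parameters:
```python
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

# Illustrative only: a CPU parameter updated by DeepSpeed's optimized Adam kernel.
param = torch.nn.Parameter(torch.zeros(1024 * 1024, device='cpu'))
optimizer = DeepSpeedCPUAdam([param], lr=1e-3)

param.grad = torch.ones_like(param)
optimizer.step()   # the Adam update itself executes on the host CPU
```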
## Training Environment
For this tutorial, we will configure a 10 billion parameter GPT-2 model using the DeepSpeed [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM-v1.1.5-ZeRO3) GPT-2 code. We advise stepping through the Megatron-LM [tutorial](/tutorials/megatron/) if you have not previously done so. We will use a single [NVIDIA Tesla V100-SXM3 Tensor Core GPU](https://www.nvidia.com/en-us/data-center/v100/) with 32GB RAM for this exercise.
## Training a 10B parameter GPT-2 on 1 V100 GPU
We need to make changes to the Megatron-LM launch script and to the DeepSpeed configuration json.
### Megatron-LM GPT-2 launch script changes
We need to apply two changes to the launch script for the DeepSpeed Megatron-LM GPT-2 model. The first change is to configure a 10B parameter GPT-2 model with activation checkpointing enabled, which can be achieved by the following set of changes:
```bash
--model-parallel-size 1 \
--num-layers 50 \
--hidden-size 4096 \
--num-attention-heads 32 \
--batch-size 10 \
--deepspeed_config ds_zero_offload.config \
--checkpoint-activations
```
Most of the flags in the changes above should be familiar if you have stepped through the Megatron-LM [tutorial](/tutorials/megatron/).
Second, we need to apply the following changes to ensure that only one GPU is used for training.
```bash
deepspeed --num_nodes 1 --num_gpus 1 ...
```
### DeepSpeed Configuration Changes
ZeRO-Offload leverages many of the ZeRO stage 2 mechanisms, so the configuration changes needed to enable ZeRO-Offload are an extension of those required to enable ZeRO stage 2. The **zero_optimization** key for enabling ZeRO-Offload is shown below:
```json
{
"zero_optimization": {
"stage": 2,
"cpu_offload": true,
"contiguous_gradients": true,
"overlap_comm": true
}
}
```
As seen above, in addition to setting the _stage_ field to **2** (to enable ZeRO stage 2), we also need to set the _cpu_offload_ flag to **true** to enable ZeRO-Offload optimizations. In addition, we can set other ZeRO stage 2 optimization flags, such as _overlap_comm_, to tune ZeRO-Offload performance. With these changes we can now run the model. We share some screenshots of the training below.
Here is a screenshot of the training log:
<a href="/assets/images/zero_offload_dp1_10B_log.png">
<img src="/assets/images/zero_offload_dp1_10B_log.png">
</a>
Here is a screenshot of nvidia-smi showing that only GPU 0 is active during training:
<a href="/assets/images/zero_offload_dp1_10B_smi.png">
<img src="/assets/images/zero_offload_dp1_10B_smi.png">
</a>
Finally, here is a screenshot of htop showing host CPU and memory activity during optimizer computation:
<a href="/assets/images/zero_offload_dp1_10B_cpu.png">
<img src="/assets/images/zero_offload_dp1_10B_cpu.png">
</a>
Congratulations! You have completed the ZeRO-Offload tutorial.
---
title: "Zero Redundancy Optimizer (ZeRO)"
---
If you have not done so already, we advise that you read the DeepSpeed tutorials on [Getting Started](/getting-started/) and [Megatron-LM GPT-2](/tutorials/megatron/) before stepping through this tutorial.
In this tutorial, we will apply the ZeRO optimizer to the [Megatron-LM GPT-2](https://github.com/NVIDIA/Megatron-LM) model. ZeRO is a powerful set of memory optimization techniques that enable effective FP16 training of large models with trillions of parameters, such as [GPT-2](https://openai.com/blog/better-language-models/) and [Turing-NLG 17B](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/). Compared to the alternative model parallelism approaches for training large models, a key appeal of ZeRO is that no model code modifications are required. As this tutorial will demonstrate, *using ZeRO in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration JSON*. No code changes are needed.
## ZeRO Overview
ZeRO leverages the aggregate computation and memory resources of data parallelism to reduce the memory and compute requirements of each device (GPU) used for model training. ZeRO reduces the memory consumption of each GPU by partitioning the various model training states (weights, gradients, and optimizer states) across the available devices (GPUs and CPUs) in the distributed training hardware. Concretely, ZeRO is being implemented as incremental stages of optimizations, where optimizations in earlier stages are available in the later stages. To deep dive into ZeRO, please see our [paper](https://arxiv.org/abs/1910.02054v3).
* **Stage 1**: The optimizer states (e.g., for the [Adam optimizer](https://arxiv.org/abs/1412.6980), the 32-bit weights and the first and second moment estimates) are partitioned across the processes, so that each process updates only its partition.
* **Stage 2**: The reduced 32-bit gradients for updating the model weights are also partitioned such that each process retains only the gradients corresponding to its portion of the optimizer states.
* **Stage 3**: The 16-bit model parameters are partitioned across the processes. ZeRO-3 will automatically collect and partition them during the forward and backward passes.
In addition, ZeRO-3 includes the *infinity offload engine* to form ZeRO-Infinity ([paper](https://arxiv.org/abs/2104.07857)), which can offload to both CPU and NVMe memory for huge memory savings.
## Training environment
We use the DeepSpeed [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM-v1.1.5-ZeRO3) GPT-2 code for this exercise. You can step through the Megatron-LM [tutorial](/tutorials/megatron/) to familiarize yourself with the code. We will train the models in this tutorial on [NVIDIA Tesla V100-SXM3 Tensor Core GPUs](https://www.nvidia.com/en-us/data-center/v100/) with 32GB RAM.
## Enabling ZeRO Optimization
To enable ZeRO optimizations for a DeepSpeed model, we simply add the **_zero_optimization_** key to the DeepSpeed JSON configuration. A full description of configuration knobs of the **zero_optimization** key is available [here](/docs/config-json/#zero-optimizations-for-fp16-training).
### Training a 1.5B Parameter GPT-2 model
We demonstrate the benefits of ZeRO stage 1 by showing that it enables data parallel training of a 1.5 billion parameter GPT-2 model on eight V100 GPUs. We configure training to use a batch size of 1 per device to ensure that the memory consumption is primarily due to model parameters and optimizer states. We create this training scenario by applying the following modifications to the deepspeed launch script:
```bash
--model-parallel-size 1 \
--num-layers 48 \
--hidden-size 1600 \
--num-attention-heads 16 \
--batch-size 1 \
--deepspeed_config ds_zero_stage_1.config \
```
Training this model without ZeRO fails with an out-of-memory (OOM) error as shown below:
<a href="/assets/images/oom_dp8_1.5B_log.png">
<img src="/assets/images/oom_dp8_1.5B_log.png">
</a>
A key reason why this model does not fit in GPU memory is that the Adam optimizer states for the model consume 18GB, a significant portion of the 32GB of RAM. By using ZeRO stage 1 to partition the optimizer state among eight data parallel ranks, the per-device memory consumption can be reduced to 2.25GB, thus making the model trainable. To enable ZeRO stage 1, we simply update the DeepSpeed JSON config file as below:
```json
{
"zero_optimization": {
"stage":1,
"reduce_bucket_size": 5e8
}
}
```
As seen above, we set two fields in the **zero_optimization** key. Specifically we set the _stage_ field to 1, and the optional _reduce_bucket_size_ for gradient reduction to 500M. With ZeRO stage 1 enabled, the model can now train smoothly on 8 GPUs without running out of memory. Below we provide some screenshots of the model training:
<a href="/assets/images/zero1_dp8_1.5B_log.png">
<img src="/assets/images/zero1_dp8_1.5B_log.png">
</a>
<a href="/assets/images/zero1_dp8_1.5B_smi.png">
<img src="/assets/images/zero1_dp8_1.5B_smi.png">
</a>
From the nvidia-smi screenshot above we can see that only GPUs 6-7 are being used for training the model. With ZeRO stage 1 we can further reduce the per-device memory consumption by increasing the data parallelism degree. These memory savings can be leveraged to increase the model size and/or the batch size. In contrast, such benefits are not possible with data parallelism alone.
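As a rough sanity check on the 18GB and 2.25GB figures above, here is a back-of-the-envelope sketch that assumes Adam keeps an fp32 copy of the parameters plus fp32 momentum and variance, i.e. roughly 12 bytes of optimizer state per parameter:
```python
# Back-of-the-envelope optimizer-state math for the 1.5B parameter model above.
num_params = 1.5e9
opt_state_bytes_per_param = 12   # assumed: fp32 weights + momentum + variance
dp_degree = 8                    # eight V100 GPUs

total_gb = num_params * opt_state_bytes_per_param / 1e9
per_gpu_gb = total_gb / dp_degree
print(total_gb, per_gpu_gb)      # -> 18.0 GB in total, 2.25 GB per GPU with ZeRO stage 1
```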
### Training a 10B Parameter GPT-2 model
ZeRO stage 2 optimizations further increase the size of models that can be trained using data parallelism. We show this by training a model with 10B parameters using 32 V100 GPUs.
First, we need to configure a 10B parameter model with activation checkpointing enabled. This can be done by applying the following GPT-2 model configuration changes to the DeepSpeed launch script.
```bash
--model-parallel-size 1 \
--num-layers 50 \
--hidden-size 4096 \
--num-attention-heads 32 \
--batch-size 1 \
--deepspeed_config ds_zero_stage_2.config \
--checkpoint-activations
```
Next, we need to update the DeepSpeed JSON configuration, as shown below, to enable ZeRO stage 2 optimizations:
```json
{
"zero_optimization": {
"stage":2,
"contiguous_gradients": true,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"allgather_bucket_size": 5e8
}
}
```
In the above changes, we have set the _stage_ field to 2, and configured other optimization knobs that are available in ZeRO stage 2. For example, we have enabled _contiguous_gradients_ to reduce memory fragmentation during the backward pass. A full description of these optimization knobs is available [here](/docs/config-json/#zero-optimizations-for-fp16-training). With these changes, we can now launch the training run.
Here is a screenshot of the training log:
<a href="/assets/images/zero2_dp32_10B_log.png">
<img src="/assets/images/zero2_dp32_10B_log.png">
</a>
Here is a screenshot of nvidia-smi showing GPU activity during training:
<a href="/assets/images/zero2_dp32_10B_smi.png">
<img src="/assets/images/zero2_dp32_10B_smi.png">
</a>
### Training trillion-scale models with ZeRO-Infinity
ZeRO-3, the third stage of ZeRO, partitions the full model state (i.e.,
weights, gradients, and optimizer states) to scale memory savings linearly
with the degree of data parallelism. ZeRO-3 can be enabled in the JSON
configuration. A full description of these configurations is available
[here](/docs/config-json/#zero-optimizations-for-fp16-training).
#### Offloading to CPU and NVMe with ZeRO-Infinity
ZeRO-Infinity uses DeepSpeed's infinity offload engine to offload the full
model state to CPU or NVMe memory, allowing for even larger model sizes. Offloading
can be enabled inside the DeepSpeed configuration:
```diff
@@ -6,5 +6,11 @@
"zero_optimization": {
"stage": 3,
"contiguous_gradients": true,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_prefetch_bucket_size": 1e7,
"stage3_param_persistence_threshold": 1e5,
"reduce_bucket_size": 1e7,
- "sub_group_size": 1e9
+ "sub_group_size": 1e9,
+ "offload_optimizer": {
+ "device": "cpu"
+ },
+ "offload_param": {
+ "device": "cpu"
+ }
}
```
**ZeRO-Infinity vs ZeRO-Offload:**
DeepSpeed first included offloading capabilities with ZeRO-Offload,
a system for offloading optimizer and gradient states to CPU memory
within ZeRO-2. ZeRO-Infinity is the next generation of offloading
capabilities accessible to ZeRO-3. ZeRO-Infinity is able to offload
more data than ZeRO-Offload and has more effective bandwidth utilization
and overlapping of computation and communication.
{: .notice--info}
#### Allocating Massive Megatron-LM Models
We make two further changes to model initialization in order to support models
that exceed *local* system memory, but not *total* system memory.
1. Allocate the model in a memory-scalable fashion. The model parameters will
be allocated and immediately partitioned across the data parallel group. If
`remote_device` is `"cpu"` or `"nvme"`, the model will also be allocated in CPU/NVMe memory
instead of GPU memory. Please see the full
[ZeRO-3 Init docs](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeed.zero.Init)
for more details.
```python
with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
remote_device=get_args().remote_device,
enabled=get_args().zero_stage==3):
model = GPT2Model(num_tokentypes=0, parallel_output=True)
```
2. Gather the embeddings weight for initialization. DeepSpeed will automatically
gather a module's parameters during its constructor and for its forward and backward pass.
However, additional accesses must coordinate with DeepSpeed to ensure that parameter data
is gathered and subsequently partitioned. If the tensor is modified, the `modifier_rank`
argument should also be used to ensure all ranks have a consistent view of
the data. Please see the full
[GatheredParameters docs](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeed.zero.GatheredParameters)
for more details.
```python
self.position_embeddings = torch.nn.Embedding(...)
with deepspeed.zero.GatheredParameters(self.position_embeddings.weight,
modifier_rank=0):
# Initialize the position embeddings.
self.init_method(self.position_embeddings.weight)
...
self.tokentype_embeddings = torch.nn.Embedding(...)
with deepspeed.zero.GatheredParameters(self.tokentype_embeddings.weight,
modifier_rank=0):
# Initialize the token-type embeddings.
self.init_method(self.tokentype_embeddings.weight)
```
#### Memory-centric tiling
ZeRO-Infinity includes a replacement for `Linear` layers that further reduces memory.
We optionally tile the model parallel linear layers found in each Transformer layer. Note
that model parallelism and tiling can be combined by specifying the corresponding
base class when building the layer.
The `deepspeed.zero.TiledLinear` module exploits the data fetch and release
pattern of ZeRO-3 to reduce the working memory requirements by breaking down
a large operator into smaller tiles that can be executed sequentially.
We include the changes for one example from Megatron-LM's [ParallelMLP](https://github.com/microsoft/DeepSpeedExamples/blob/bdf8e59aede8c8e0577e8d4d557298ca8515268f/Megatron-LM-v1.1.5-ZeRO3/megatron/model/transformer.py#L82). Three more
model-parallel layers in `transformer.py` proceed similarly.
The model parallel layers of Megatron-LM have a special form in which the
additive `bias` of the layer is delayed and instead returned from `forward()`
to be fused with a later operator. DeepSpeed's
`deepspeed.zero.TiledLinearReturnBias` subclass of `TiledLinear` simply also
forwards the returned `bias` parameter without accumulating.
```diff
@@ -1,6 +1,9 @@
-self.dense_h_to_4h = mpu.ColumnParallelLinear(
+self.dense_h_to_4h = deepspeed.zero.TiledLinearReturnBias(
args.hidden_size,
4 * args.hidden_size,
+ in_splits=args.tile_factor,
+ out_splits=4*args.tile_factor,
+ linear_cls=mpu.ColumnParallelLinear,
gather_output=False,
init_method=init_method,
skip_bias_add=True)
```
Note that we scale `in_splits` and `out_splits` proportionally with `input_size` and `output_size`. This
results in tiles of fixed size `[hidden/tile_factor, hidden/tile_factor]`.
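For a plain (non-model-parallel) layer, the same idea can be applied with `deepspeed.zero.TiledLinear` directly. The following is only a rough sketch, under the assumption that `TiledLinear` mirrors `torch.nn.Linear`'s constructor and accepts the same `in_splits`/`out_splits` arguments shown in the diff above:
```python
import deepspeed

hidden_size = 4096
tile_factor = 4   # hypothetical tiling factor, analogous to args.tile_factor above

# Assumed drop-in replacement for torch.nn.Linear inside a ZeRO-3 model; each tile
# is then roughly [hidden_size / tile_factor, hidden_size / tile_factor] in size.
dense_h_to_4h = deepspeed.zero.TiledLinear(hidden_size,
                                           4 * hidden_size,
                                           in_splits=tile_factor,
                                           out_splits=4 * tile_factor)
```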
#### Registering external parameters
**Deprecated:**
DeepSpeed version `0.3.15` introduced automatic external parameter
registration and this step is no longer needed.
{: .notice--info}
## Extracting weights
If you need to extract the pretrained fp16 weights from DeepSpeed, here is what you can do:
- under ZeRO-2, `state_dict` contains the fp16 model weights, and these can be saved normally with `torch.save`.
- under ZeRO-3, `state_dict` contains just placeholders, since the model weights are partitioned across multiple GPUs. If you want to get these weights, enable:
```json
"zero_optimization": {
"stage3_gather_fp16_weights_on_model_save": true
},
```
And then save the model using:
```python
if self.deepspeed:
self.deepspeed.save_fp16_model(output_dir, output_file)
```
Because it requires consolidation of the weights on one GPU, it can be slow and memory demanding, so only use this feature when needed.
Note that if `stage3_gather_fp16_weights_on_model_save` is `False`, no weights will be saved (again, because `state_dict` doesn't have them).
You can use this method to save ZeRO-2 weights as well.
If you'd like to get the fp32 weights, we supply a special script that can do offline consolidation. It requires no configuration files or GPUs. Here is an example of its usage:
``` bash
$ cd /path/to/checkpoint_dir
$ ./zero_to_fp32.py . pytorch_model.bin
Processing zero checkpoint at global_step1
Detected checkpoint of type zero stage 3, world_size: 2
Saving fp32 state dict to pytorch_model.bin (total_numel=60506624)
```
The `zero_to_fp32.py` script gets created automatically when you save a checkpoint.
Note: currently this script requires about 2x the size of the final checkpoint in general RAM.
Alternatively, if you have plenty of spare CPU memory and, instead of producing a file, you want your model to be updated in place with its fp32 weights, you can do the following at the end of training:
``` python
from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
fp32_model = load_state_dict_from_zero_checkpoint(deepspeed.module, checkpoint_dir)
```
Beware that the model will be good for saving, but no longer good for continuing training, and will require calling `deepspeed.initialize()` anew.
If you just want the `state_dict`, you can do:
``` python
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
```
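The returned `state_dict` is a regular fp32 PyTorch state dict, so it can be saved with `torch.save` or loaded into a freshly constructed, non-DeepSpeed copy of the model (assumed to exist as `model` below):
```python
import torch

# Persist the consolidated fp32 weights, or load them into a plain PyTorch model.
torch.save(state_dict, "pytorch_model.bin")
model.load_state_dict(state_dict)
```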
Congratulations! You have completed the ZeRO tutorial.
Learning Rate Schedulers
========================
DeepSpeed offers implementations of the ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, and ``WarmupDecayLR`` learning rate schedulers. When using one of DeepSpeed's learning rate schedulers (specified in the ``ds_config.json`` file), DeepSpeed calls the scheduler's ``step()`` method at every training step (when ``model_engine.step()`` is executed). When not using a DeepSpeed learning rate scheduler:
* if the schedule is supposed to execute at every training step, the user can pass the scheduler to ``deepspeed.initialize`` when initializing the DeepSpeed engine and let DeepSpeed manage it for updates and save/restore.
* if the schedule is supposed to execute at any other interval (e.g., training epochs), the user should NOT pass the scheduler to DeepSpeed during initialization and must manage it explicitly, as sketched below.
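For example, the two cases might look like the following sketch, where ``my_per_step_scheduler``, ``build_epoch_scheduler``, ``run_one_epoch``, and ``num_epochs`` are illustrative placeholders supplied by the user rather than DeepSpeed APIs:

.. code-block:: python

    import deepspeed

    # Per-step schedule: hand the scheduler (or a callable that builds one from the
    # optimizer) to deepspeed.initialize and let DeepSpeed call step() every step.
    model_engine, optimizer, _, lr_scheduler = deepspeed.initialize(
        args=args,
        model=model,
        model_parameters=model.parameters(),
        lr_scheduler=my_per_step_scheduler)

    # Per-epoch schedule: do NOT pass the scheduler to DeepSpeed; manage it explicitly.
    model_engine, optimizer, _, _ = deepspeed.initialize(
        args=args, model=model, model_parameters=model.parameters())
    epoch_scheduler = build_epoch_scheduler(optimizer)
    for epoch in range(num_epochs):
        run_one_epoch(model_engine)
        epoch_scheduler.step()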
LRRangeTest
---------------------------
.. autoclass:: deepspeed.runtime.lr_schedules.LRRangeTest
OneCycle
---------------------------
.. autoclass:: deepspeed.runtime.lr_schedules.OneCycle
WarmupLR
---------------------------
.. autoclass:: deepspeed.runtime.lr_schedules.WarmupLR
WarmupDecayLR
---------------------------
.. autoclass:: deepspeed.runtime.lr_schedules.WarmupDecayLR
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
import time
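# Micro-benchmark for DeepSpeedCPUAdam: two CPU parameter groups (a ~1B-element
# tensor and a small 274K-element one) are updated repeatedly and the average
# optimizer.step() latency over 100 iterations is printed. The commented-out
# lines show the single-tensor variant of the same benchmark.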
device = 'cpu'
model_size = 1 * 1024**3
group_size = [model_size, 274432]
param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size]
optimizer = DeepSpeedCPUAdam(param)
#torch.set_num_threads(128)
for i, p in enumerate(param):
p.grad = torch.ones(group_size[i], device=device)
#param.grad = torch.ones(model_size, device=device)
avg = 0
for i in range(100):
start = time.time()
optimizer.step()
stop = time.time()
avg += (stop - start)
for i, p in enumerate(param):
p.grad = torch.ones(group_size[i], device=device) * 2
#param.grad = torch.ones(model_size, device=device) * 2
print("Elapsed Time is ", avg / 100)
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
import time
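# Micro-benchmark for DeepSpeedCPUAdam with fp16_param_groups: each step updates a
# ~1B-element fp32 parameter on the CPU and copies the result into a matching fp16
# parameter on GPU 0; the average optimizer.step() latency over 100 runs is printed.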
device = 'cpu'
model_size = 1 * 1024**3
param = torch.nn.Parameter(torch.ones(model_size, device=device))
param_fp16 = torch.nn.Parameter(torch.ones(model_size,
dtype=torch.half,
device='cuda:0'))
optimizer = DeepSpeedCPUAdam([param])
#torch.set_num_threads(128)
param.grad = torch.ones(model_size, device=device)
avg = 0
for i in range(100):
start = time.time()
optimizer.step(fp16_param_groups=[param_fp16])
stop = time.time()
avg += (stop - start)
param.grad = torch.ones(model_size, device=device) * 2
print("Elapsed Time is ", avg / 100)
{
"train_batch_size": 2,
"gradient_accumulation_steps": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0
}
}
import pytest
import os
import filecmp
import torch
import deepspeed
import torch.distributed as dist
from common import distributed_test
from deepspeed.ops.aio import AsyncIOBuilder
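# Functional tests for DeepSpeed's async I/O ops: synchronous and asynchronous
# pread/pwrite through AsyncIOBuilder's aio_handle are validated against reference
# files written with os.urandom, using both pinned CPU buffers and CUDA buffers,
# and with multiple in-flight requests queued on a single handle.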
MEGA_BYTE = 1024**2
BLOCK_SIZE = MEGA_BYTE
QUEUE_DEPTH = 2
IO_SIZE = 16 * MEGA_BYTE
IO_PARALLEL = 2
def _skip_if_no_aio():
if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]:
pytest.skip('Skip tests since async-io is not compatible')
def _do_ref_write(tmpdir, index=0):
file_suffix = f'{dist.get_rank()}_{index}'
ref_file = os.path.join(tmpdir, f'_py_random_{file_suffix}.pt')
ref_buffer = os.urandom(IO_SIZE)
with open(ref_file, 'wb') as f:
f.write(ref_buffer)
return ref_file, ref_buffer
def _get_test_file_and_buffer(tmpdir, ref_buffer, cuda_device, index=0):
file_suffix = f'{dist.get_rank()}_{index}'
test_file = os.path.join(tmpdir, f'_aio_write_random_{file_suffix}.pt')
if cuda_device:
test_buffer = torch.cuda.ByteTensor(list(ref_buffer))
else:
test_buffer = torch.ByteTensor(list(ref_buffer)).pin_memory()
return test_file, test_buffer
def _validate_handle_state(handle, single_submit, overlap_events):
assert handle.get_single_submit() == single_submit
assert handle.get_overlap_events() == overlap_events
assert handle.get_thread_count() == IO_PARALLEL
assert handle.get_block_size() == BLOCK_SIZE
assert handle.get_queue_depth() == QUEUE_DEPTH
@pytest.mark.parametrize('single_submit, overlap_events',
[(False,
False),
(False,
True),
(True,
False),
(True,
True)])
def test_parallel_read(tmpdir, single_submit, overlap_events):
_skip_if_no_aio()
@distributed_test(world_size=[2])
def _test_parallel_read(single_submit, overlap_events):
ref_file, _ = _do_ref_write(tmpdir)
aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu').pin_memory()
h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
QUEUE_DEPTH,
single_submit,
overlap_events,
IO_PARALLEL)
_validate_handle_state(h, single_submit, overlap_events)
read_status = h.sync_pread(aio_buffer, ref_file)
assert read_status == 1
with open(ref_file, 'rb') as f:
ref_buffer = list(f.read())
assert ref_buffer == aio_buffer.tolist()
_test_parallel_read(single_submit, overlap_events)
@pytest.mark.parametrize('single_submit, overlap_events, cuda_device',
[(False,
False,
False),
(False,
True,
False),
(True,
False,
False),
(True,
True,
False),
(False,
False,
True),
(True,
True,
True)])
def test_async_read(tmpdir, single_submit, overlap_events, cuda_device):
_skip_if_no_aio()
@distributed_test(world_size=[2])
def _test_async_read(single_submit, overlap_events, cuda_device):
ref_file, _ = _do_ref_write(tmpdir)
if cuda_device:
aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda')
else:
aio_buffer = torch.empty(IO_SIZE,
dtype=torch.uint8,
device='cpu').pin_memory()
h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
QUEUE_DEPTH,
single_submit,
overlap_events,
IO_PARALLEL)
_validate_handle_state(h, single_submit, overlap_events)
read_status = h.async_pread(aio_buffer, ref_file)
assert read_status == 0
wait_status = h.wait()
assert wait_status == 1
with open(ref_file, 'rb') as f:
ref_buffer = list(f.read())
assert ref_buffer == aio_buffer.tolist()
_test_async_read(single_submit, overlap_events, cuda_device)
@pytest.mark.parametrize('single_submit, overlap_events',
[(False,
False),
(False,
True),
(True,
False),
(True,
True)])
def test_parallel_write(tmpdir, single_submit, overlap_events):
_skip_if_no_aio()
@distributed_test(world_size=[2])
def _test_parallel_write(single_submit, overlap_events):
ref_file, ref_buffer = _do_ref_write(tmpdir)
aio_file, aio_buffer = _get_test_file_and_buffer(tmpdir, ref_buffer, False)
h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
QUEUE_DEPTH,
single_submit,
overlap_events,
IO_PARALLEL)
_validate_handle_state(h, single_submit, overlap_events)
write_status = h.sync_pwrite(aio_buffer, aio_file)
assert write_status == 1
assert os.path.isfile(aio_file)
filecmp.clear_cache()
assert filecmp.cmp(ref_file, aio_file, shallow=False)
_test_parallel_write(single_submit, overlap_events)
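# Non-blocking write path: submit with async_pwrite (returns 0), wait() for the single completion, then compare files.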
@pytest.mark.parametrize('single_submit, overlap_events, cuda_device',
                         [(False, False, False), (False, True, False), (True, False, False),
                          (True, True, False), (False, False, True), (True, True, True)])
def test_async_write(tmpdir, single_submit, overlap_events, cuda_device):
_skip_if_no_aio()
@distributed_test(world_size=[2])
def _test_async_write(single_submit, overlap_events, cuda_device):
ref_file, ref_buffer = _do_ref_write(tmpdir)
aio_file, aio_buffer = _get_test_file_and_buffer(tmpdir, ref_buffer, cuda_device)
h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
QUEUE_DEPTH,
single_submit,
overlap_events,
IO_PARALLEL)
_validate_handle_state(h, single_submit, overlap_events)
write_status = h.async_pwrite(aio_buffer, aio_file)
assert write_status == 0
wait_status = h.wait()
assert wait_status == 1
assert os.path.isfile(aio_file)
filecmp.clear_cache()
assert filecmp.cmp(ref_file, aio_file, shallow=False)
_test_async_write(single_submit, overlap_events, cuda_device)
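# Queue several async reads on one handle; wait() should report async_queue completions and every buffer must match its reference file.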
@pytest.mark.parametrize('async_queue, cuda_device',
                         [(2, False), (4, False), (2, True), (4, True)])
def test_async_queue_read(tmpdir, async_queue, cuda_device):
_skip_if_no_aio()
@distributed_test(world_size=[2])
def _test_async_queue_read(async_queue, cuda_device):
ref_files = []
for i in range(async_queue):
f, _ = _do_ref_write(tmpdir, i)
ref_files.append(f)
aio_buffers = []
for i in range(async_queue):
if cuda_device:
buf = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda')
else:
buf = torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu').pin_memory()
aio_buffers.append(buf)
single_submit = True
overlap_events = True
h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
QUEUE_DEPTH,
single_submit,
overlap_events,
IO_PARALLEL)
_validate_handle_state(h, single_submit, overlap_events)
for i in range(async_queue):
read_status = h.async_pread(aio_buffers[i], ref_files[i])
assert read_status == 0
wait_status = h.wait()
assert wait_status == async_queue
for i in range(async_queue):
with open(ref_files[i], 'rb') as f:
ref_buffer = list(f.read())
assert ref_buffer == aio_buffers[i].tolist()
_test_async_queue_read(async_queue, cuda_device)
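# Queue several async writes on one handle; wait() should report async_queue completions and every output file must match its reference.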
@pytest.mark.parametrize('async_queue, cuda_device',
                         [(2, False), (7, False), (2, True), (7, True)])
def test_async_queue_write(tmpdir, async_queue, cuda_device):
_skip_if_no_aio()
@distributed_test(world_size=[2])
def _test_async_queue_write(async_queue, cuda_device):
ref_files = []
ref_buffers = []
for i in range(async_queue):
f, buf = _do_ref_write(tmpdir, i)
ref_files.append(f)
ref_buffers.append(buf)
aio_files = []
aio_buffers = []
for i in range(async_queue):
f, buf = _get_test_file_and_buffer(tmpdir, ref_buffers[i], cuda_device, i)
aio_files.append(f)
aio_buffers.append(buf)
single_submit = True
overlap_events = True
h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
QUEUE_DEPTH,
single_submit,
overlap_events,
IO_PARALLEL)
_validate_handle_state(h, single_submit, overlap_events)
        for i in range(async_queue):
            write_status = h.async_pwrite(aio_buffers[i], aio_files[i])
            assert write_status == 0
wait_status = h.wait()
assert wait_status == async_queue
for i in range(async_queue):
assert os.path.isfile(aio_files[i])
filecmp.clear_cache()
assert filecmp.cmp(ref_files[i], aio_files[i], shallow=False)
_test_async_queue_write(async_queue, cuda_device)

@ -1,125 +1,125 @@
import torch
import numpy as np
import pytest
import deepspeed
from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad
from deepspeed.ops.op_builder import CPUAdagradBuilder
if not deepspeed.ops.__compatible_ops__[CPUAdagradBuilder.NAME]:
pytest.skip("cpu-adagrad is not compatible")
def check_equal(first, second, atol=1e-2, verbose=False):
x = first.detach().numpy()
y = second.detach().numpy()
if verbose:
print("x = {}".format(x.flatten()))
print("y = {}".format(y.flatten()))
print('-' * 80)
np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol)
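# Dense case: DeepSpeedCPUAdagrad is expected to track torch.optim.Adagrad to within atol=1e-2 over 10 steps on identical random gradients.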
@pytest.mark.parametrize('model_size',
                         [64, 22, 55, 127, 1024, 1048576, 30000000])  # yapf: disable
def test_cpu_adagrad_opt(model_size):
device = 'cpu'
rng_state = torch.get_rng_state()
param = torch.nn.Parameter(torch.randn(model_size, device=device))
torch.set_rng_state(rng_state)
param1 = torch.nn.Parameter(torch.randn(model_size, device=device))
torch.set_rng_state(rng_state)
optimizer = DeepSpeedCPUAdagrad([param])
optimizer1 = torch.optim.Adagrad([param1])
for i in range(10):
rng_state = torch.get_rng_state()
param.grad = torch.randn(model_size, device=device)
torch.set_rng_state(rng_state)
param1.grad = torch.randn(model_size, device=device)
optimizer.step()
optimizer1.step()
check_equal(param, param1, atol=1e-2, verbose=True)
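# Sparse case: gradients are built as sparse COO tensors over the flattened (vocabulary_size * dim,) parameter so both optimizers apply the same sparse update.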
@pytest.mark.parametrize('model_size,vocabulary_size,dim',
[
(16 * 2, 16 * 4, 16),
(16 * 32, 16 * 256, 16),
(16 * 256, 16 * 16384, 16),
]) # yapf: disable
def test_cpu_adagrad_opt_sparse_embedding(model_size, vocabulary_size, dim):
device = 'cpu'
rng_state = torch.get_rng_state()
    def gen_sparse_grad(vocabulary_size, dim, num_indices, dtype, device):
        i = torch.randint(vocabulary_size, size=(1, num_indices),
                          dtype=torch.int64, device=device)
        v = torch.randn(num_indices, dim, dtype=dtype, device=device)
        t = torch.sparse_coo_tensor(i, v, (vocabulary_size, dim), device=device)
        t = t.coalesce()
        new_i = (t.indices().view(-1, 1).repeat(1, dim) * dim +
                 torch.tensor(range(dim))).flatten().unsqueeze(0)
        new_v = t.values().flatten()
        new_t = torch.sparse_coo_tensor(new_i, new_v, (vocabulary_size * dim, ), device=device)
        new_t = new_t.coalesce()
        new_t.requires_grad = False
        return new_t
voc_size = vocabulary_size
dim = dim
num_indices = int(model_size // dim)
dtype = torch.float32
    param = torch.nn.Parameter(torch.randn((voc_size * dim, ), dtype=dtype, device=device),
                               requires_grad=True)
    torch.set_rng_state(rng_state)
    param1 = torch.nn.Parameter(torch.randn((voc_size * dim, ), dtype=dtype, device=device),
                                requires_grad=True)
torch.set_rng_state(rng_state)
optimizer = DeepSpeedCPUAdagrad([param])
optimizer1 = torch.optim.Adagrad([param1])
for i in range(10):
torch.set_rng_state(rng_state)
        param.grad = gen_sparse_grad(voc_size, dim, num_indices, dtype=dtype, device=device)
        torch.set_rng_state(rng_state)
        param1.grad = gen_sparse_grad(voc_size, dim, num_indices, dtype=dtype, device=device)
optimizer.step()
optimizer1.step()
check_equal(param, param1, atol=1e-2, verbose=True)

@ -1,62 +1,62 @@
import argparse
import torch
import time
import numpy as np
import pytest
import copy
import deepspeed
from deepspeed.ops.adam import FusedAdam
from deepspeed.ops.op_builder import CPUAdamBuilder
if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
def check_equal(first, second, atol=1e-2, verbose=False):
x = first.detach().numpy()
y = second.detach().numpy()
if verbose:
print("x = {}".format(x.flatten()))
print("y = {}".format(y.flatten()))
print('-' * 80)
np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol)
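# DeepSpeedCPUAdam is expected to match torch.optim.AdamW on CPU and FusedAdam on GPU to within atol=1e-2 after 10 identical steps.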
@pytest.mark.parametrize('model_size',
                         [64, 22, 55, 127, 1024, 1048576])  # yapf: disable
def test_cpu_adam_opt(model_size):
from deepspeed.ops.adam import DeepSpeedCPUAdam
device = 'cpu'
rng_state = torch.get_rng_state()
param = torch.nn.Parameter(torch.randn(model_size, device=device))
torch.set_rng_state(rng_state)
param1 = torch.nn.Parameter(torch.randn(model_size, device=device))
torch.set_rng_state(rng_state)
param2_data = torch.randn(model_size, device=device).cuda()
param2 = torch.nn.Parameter(param2_data)
optimizer1 = torch.optim.AdamW([param1])
optimizer2 = FusedAdam([param2])
optimizer = DeepSpeedCPUAdam([param])
for i in range(10):
rng_state = torch.get_rng_state()
param.grad = torch.randn(model_size, device=device)
torch.set_rng_state(rng_state)
param1.grad = torch.randn(model_size, device=device)
torch.set_rng_state(rng_state)
param2.grad = torch.randn(model_size, device=device).cuda()
optimizer.step()
optimizer2.step()
optimizer1.step()
check_equal(param, param1, atol=1e-2, verbose=True)
check_equal(param, param2.cpu(), atol=1e-2, verbose=True)

File diff suppressed because it is too large
@ -1,117 +1,117 @@
import numpy as np
import deepspeed
import pytest
from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
from common import distributed_test
from simple_model import SimpleModel, PLD_SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
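# ProgressiveLayerDrop keep-probability schedule checked below: theta(t) = (1 - theta) * exp(-gamma * t) + theta.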
@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0])
def test_pld_schedule(tmpdir, theta):
gamma = 0.001
pld_scheduler = ProgressiveLayerDrop(theta, gamma)
for i in range(10):
pld_scheduler.update_state(i)
expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
actual_theta = pld_scheduler.get_theta()
assert expected_theta == actual_theta
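# End-to-end: during training, the engine's get_pld_theta() should follow the same schedule as the standalone scheduler.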
@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0])
def test_pld_model(tmpdir, theta):
gamma = 0.001
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"optimizer": {
"type": 'Adam',
"params": {
"lr": 0.0001
}
},
"fp16": {
"enabled": True
},
"progressive_layer_drop": {
"enabled": True,
"theta": theta,
"gamma": gamma
}
}
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = PLD_SimpleModel(hidden_dim, empty_grad=False)
@distributed_test(world_size=[1])
def _test_pld_model(args, model, hidden_dim, theta, gamma):
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for i, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
actual_theta = model.get_pld_theta()
assert expected_theta == actual_theta
_test_pld_model(args=args,
model=model,
hidden_dim=hidden_dim,
theta=theta,
gamma=gamma)
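# With PLD enabled in the config but a plain SimpleModel, the forward call is expected to raise TypeError (presumably because forward() does not accept the extra PLD arguments).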
def test_non_pld_model(tmpdir):
gamma = 0.001
theta = 0.5
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"optimizer": {
"type": 'Adam',
"params": {
"lr": 0.0001
}
},
"fp16": {
"enabled": True
},
"progressive_layer_drop": {
"enabled": True,
"theta": theta,
"gamma": gamma
}
}
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=False)
@distributed_test(world_size=[1])
def _test_non_pld_model(args, model, hidden_dim):
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=1,
hidden_dim=hidden_dim,
device=model.device)
for i, batch in enumerate(data_loader):
with pytest.raises(TypeError):
loss = model(batch[0], batch[1])
_test_non_pld_model(args=args, model=model, hidden_dim=hidden_dim)